diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 764a7a9bf5e52..f8fa50a706721 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1780,7 +1780,9 @@ def FeatureISAVersion11_Common : FeatureSet< FeatureImageInsts, FeaturePackedTID, FeatureVcmpxPermlaneHazard, - FeatureMemoryAtomicFAddF32DenormalSupport]>; + FeatureMemoryAtomicFAddF32DenormalSupport, + FeatureRealTrue16Insts +]>; // There are few workarounds that need to be // added to all targets. This pessimizes codegen diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 85276bd24bcf4..ba832e52892d3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7221,24 +7221,44 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) { return DeferredList.contains(MI); } -// 16bit SALU use sgpr32. If a 16bit SALU get lowered to VALU in true16 mode, -// sgpr32 is replaced to vgpr32 which is illegal in t16 inst. Need to add -// subreg access properly. This can be removed after we have sgpr16 in place -void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst, +// legalize operand between 16bit and 32bit registers in v2s copy +// lowering (change sgpr to vgpr). +// This is mainly caused by 16bit SALU and 16bit VALU using reg with different +// size. Need to legalize the size of the operands during the vgpr lowering +// chain. 
This can be removed after we have sgpr16 in place +void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, MachineRegisterInfo &MRI) const { - unsigned Opcode = Inst.getOpcode(); - if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts()) + if (!ST.useRealTrue16Insts()) return; - for (MachineOperand &Op : Inst.explicit_operands()) { + unsigned Opcode = MI.getOpcode(); + MachineBasicBlock *MBB = MI.getParent(); + + // legalize operands and check for size mismatch + for (MachineOperand &Op : MI.explicit_operands()) { unsigned OpIdx = Op.getOperandNo(); if (!OpIdx) continue; - if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) { + if (Op.isReg() && Op.getReg().isVirtual() && RI.isVGPR(MRI, Op.getReg())) { unsigned RCID = get(Opcode).operands()[OpIdx].RegClass; - const TargetRegisterClass *RC = RI.getRegClass(RCID); - if (RI.getRegSizeInBits(*RC) == 16) { + const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID); + const TargetRegisterClass *RC = MRI.getRegClass(Op.getReg()); + if (32 == RI.getRegSizeInBits(*RC) && + 16 == RI.getRegSizeInBits(*ExpectedRC)) { Op.setSubReg(AMDGPU::lo16); + } else if (16 == RI.getRegSizeInBits(*RC) && + 32 == RI.getRegSizeInBits(*ExpectedRC)) { + const DebugLoc &DL = MI.getDebugLoc(); + Register NewDstReg = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef); + BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg) + .addReg(Op.getReg()) + .addImm(AMDGPU::lo16) + .addReg(Undef) + .addImm(AMDGPU::hi16); + Op.setReg(NewDstReg); } } } @@ -7783,8 +7803,19 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, .addReg(Undef) .addImm(AMDGPU::hi16); Inst.eraseFromParent(); - MRI.replaceRegWith(DstReg, NewDstReg); + // legalize useMI with mismatched size + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), + E = MRI.use_end(); + I != E; ++I) { + MachineInstr &UseMI = 
*I->getParent(); + unsigned UseMIOpcode = UseMI.getOpcode(); + if (AMDGPU::isTrue16Inst(UseMIOpcode) && + (16 == + RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) { + I->setSubReg(AMDGPU::lo16); + } + } addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); return; } diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 1ef7d358d8cae..8ae7b58330256 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <18 x float> @bitcast_v18i32_to_v18f32(<18 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v18i32_to_v18f32: @@ -1227,113 +1228,145 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v18i32_to_v36i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; 
GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: ; implicit-def: $vgpr19 -; GFX11-NEXT: ; implicit-def: $vgpr18 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB6_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: 
v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB6_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v20, 
v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v18i32_to_v36i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v18i32_to_v36i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: 
s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -1963,73 +1996,105 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v36i16_to_v18i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 
v29, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] 
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v18i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v18i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, 
s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2610,113 +2675,145 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v18i32_to_v36f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: ; implicit-def: $vgpr19 -; GFX11-NEXT: ; implicit-def: $vgpr18 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 
16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB8_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB8_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v18i32_to_v36f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v18i32_to_v36f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 
v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3423,73 +3520,105 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v36f16_to_v18i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v15, v21, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v18i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v18i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4436,104 +4565,127 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v18f32_to_v36i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: 
; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: ; implicit-def: $vgpr19 -; GFX11-NEXT: ; implicit-def: $vgpr18 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB14_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, 
v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB14_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v17, v18, v17, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v18f32_to_v36i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: 
v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5163,73 +5315,105 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v36i16_to_v18f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v33, v0, 
0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: 
v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB15_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v18f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v18f32: +; 
GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5810,104 +5994,127 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v18f32_to_v36f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: ; implicit-def: $vgpr19 -; GFX11-NEXT: ; implicit-def: $vgpr18 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB16_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: 
v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB16_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v18f32_to_v36f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end +; 
GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6614,73 +6821,105 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v36f16_to_v18f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v28, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: 
v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB17_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v18f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 
op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v18f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v6, v30, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7361,118 +7600,155 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v9i64_to_v36i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: ; implicit-def: $vgpr19 -; GFX11-NEXT: ; implicit-def: $vgpr18 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB20_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; 
GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB20_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v9i64_to_v36i16: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v9i64_to_v36i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 
16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, 
v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8102,73 +8378,105 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v36i16_to_v9i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v13, v23, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB21_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v9i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v9i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8749,118 +9057,155 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v9i64_to_v36f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; 
GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: ; implicit-def: $vgpr19 -; GFX11-NEXT: ; implicit-def: $vgpr18 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB22_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; 
GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB22_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 
v5, v30, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v9i64_to_v36f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v9i64_to_v36f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; 
GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -9567,73 +9912,105 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v36f16_to_v9i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 
op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB23_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v9i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 
op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v9i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 
exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -10034,104 +10411,127 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v9f64_to_v36i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: ; implicit-def: $vgpr19 -; GFX11-NEXT: ; implicit-def: $vgpr18 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB24_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB24_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v31, 
v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v9f64_to_v36i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -10761,73 +11161,105 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v36i16_to_v9f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v35, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 
v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB25_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v9f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v9f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 
v14, v22, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11372,104 +11804,127 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4 ; 
GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v9f64_to_v36f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: ; implicit-def: $vgpr19 -; GFX11-NEXT: ; implicit-def: $vgpr18 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB26_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 
s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-NEXT: .LBB26_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v9f64_to_v36f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; 
GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -12176,73 +12631,105 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v36f16_to_v9f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; 
GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB27_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v9f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
bitcast_v36f16_to_v9f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 
v16, v20, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB27_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -12883,109 +13370,141 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v17, v35, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX11-LABEL: bitcast_v36i16_to_v36f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_perm_b32 v17, v36, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v34, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v32, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v30, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v28, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v26, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v25, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v24, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v19, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v20, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 
v2, v21, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v22, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v23, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; GFX11-NEXT: .LBB28_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v19, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v20, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v21, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v22, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v23, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v24, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v25, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v26, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v28, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v30, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v32, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v34, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v36, v17, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v36f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v36f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v36, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v34, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v32, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v30, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v28, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v26, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v25, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v24, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v19, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v20, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v21, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v22, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v23, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v19, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v20, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v21, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v22, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v23, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v24, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v25, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v26, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v28, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v30, v11, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v32, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v34, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v36, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -13530,109 +14049,141 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v17, v35, v17, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v36f16_to_v36i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_perm_b32 v17, v36, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v34, v15, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v32, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v30, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v28, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v26, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v25, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v24, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v19, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v20, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v21, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v22, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v23, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v23, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; GFX11-NEXT: .LBB29_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v19, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v20, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v21, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v22, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v23, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v24, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v25, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v26, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v28, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v30, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v32, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v34, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v36, v17, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v36i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v36i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 
16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v36, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v34, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v32, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v30, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v28, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v26, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v25, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v24, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v19, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v20, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v21, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v22, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v23, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 
0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v36, 16, v17 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v19, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v20, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v21, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v22, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v23, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v24, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v25, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v26, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v28, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v30, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v32, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v34, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v36, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 8f9de9e898301..67e035ba7d934 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn 
-mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <20 x float> @bitcast_v20i32_to_v20f32(<20 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v20i32_to_v20f32: @@ -1310,123 +1311,157 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v20i32_to_v40i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB6_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 
v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB6_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v20i32_to_v40i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz 
.LBB6_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v20i32_to_v40i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2160,79 +2195,113 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v40i16_to_v20i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v30, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; 
GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v20i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] 
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2879,123 +2948,157 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v20i32_to_v40f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 
-; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB8_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; 
GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB8_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v20i32_to_v40f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v20i32_to_v40f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v19, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3814,79 +3917,113 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v40f16_to_v20i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v12, v28, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; 
GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v20i32: +; GFX11-FAKE16: ; 
%bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 
v14, v26, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: 
.LBB9_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4899,113 +5036,137 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v20f32_to_v40i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB14_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB14_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v20f32_to_v40i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 
v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v20f32_to_v40i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; 
GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: 
v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5739,79 +5900,113 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v40i16_to_v20f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: 
v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB15_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v20f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6458,113 +6653,137 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v20f32_to_v40f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB16_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB16_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v20f32_to_v40f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v20f32_to_v40f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: 
v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7383,79 +7602,113 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v40f16_to_v20f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v32, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 
0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB17_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 
0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v20f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8188,128 +8441,167 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10i64_to_v40i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; 
implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB20_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB20_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v10i64_to_v40i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; 
GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v10i64_to_v40i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_2: ; 
%Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -9043,79 +9335,113 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v40i16_to_v10i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, 
v32, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: 
.LBB21_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v10i64: 
+; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 
exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -9762,128 +10088,167 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10i64_to_v40f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB22_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; 
GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB22_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v17, v22, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v10i64_to_v40f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, 
vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v10i64_to_v40f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: 
v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; 
GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -10702,79 +11067,113 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v40f16_to_v10i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 
0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB23_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, 
v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v10i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11212,113 +11611,137 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10f64_to_v40i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; 
implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB24_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: 
v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB24_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v15, v24, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v10f64_to_v40i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; 
GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -12052,79 +12475,113 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v40i16_to_v10f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 
16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] 
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB25_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v10f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: 
v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -12731,113 +13188,137 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10f64_to_v40f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: ; implicit-def: $vgpr21 -; GFX11-NEXT: ; implicit-def: $vgpr20 -; 
GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB26_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-NEXT: .LBB26_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v10f64_to_v40f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 
1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, 
v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -13656,79 +14137,113 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v40f16_to_v10f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, 
v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB27_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, 
v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v10f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB27_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -14482,119 +14997,153 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v19, v39, v19, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v40i16_to_v40f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 -; GFX11-NEXT: s_xor_b32 s0, 
exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v39, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v37, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v35, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v33, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v31, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v29, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v27, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v26, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v21, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v23, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v24, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v25, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: .LBB28_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v21, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v23, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v24, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v25, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v26, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v27, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v29, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v31, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v33, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, 
v35, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v37, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v39, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v40f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 
v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v40f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v39, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v37, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v14, v35, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v33, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v31, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v29, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v27, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v26, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v21, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v23, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v24, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v25, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v21, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v23, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v24, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v25, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v26, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v27, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v29, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v31, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v33, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v35, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v37, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v39, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -15206,119 +15755,153 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v19, v39, v19, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v40f16_to_v40i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_perm_b32 v19, v48, 
v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v39, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v37, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v35, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v33, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v31, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v29, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v27, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v26, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v21, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v23, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v24, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v25, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 
v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: .LBB29_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v21, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v23, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v24, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v25, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v26, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v27, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v29, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v31, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v33, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v35, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v37, v16, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v39, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v40i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: 
.LBB29_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v40i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v39, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v37, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v35, v14, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v33, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v31, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v29, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v27, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v26, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v21, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v23, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v24, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v25, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v21, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v23, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v24, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v25, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v26, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v27, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v29, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v31, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v33, v12, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v35, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v37, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v39, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index a0fe407022d81..08590a3af70f5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <22 x float> @bitcast_v22i32_to_v22f32(<22 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v22i32_to_v22f32: @@ -1394,133 +1395,169 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v22i32_to_v44i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 
-; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB6_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_4 -; 
GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: 
.LBB6_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v22i32_to_v44i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v22i32_to_v44i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2357,85 +2394,121 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v44i16_to_v22i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v24, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v22i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, 
v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3167,133 +3240,169 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v22i32_to_v44f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB8_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 
16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB8_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 
-; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v22i32_to_v44f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v22i32_to_v44f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, 
v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 
v17, v26, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4224,85 +4333,121 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v44f16_to_v22i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: 
v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v22i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5382,122 +5527,147 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v22f32_to_v44i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: 
$vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB14_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; 
GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB14_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: 
v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 
1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6334,85 +6504,121 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v44i16_to_v22f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 
v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB15_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v22f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7144,122 +7350,147 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v22f32_to_v44f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; 
GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB16_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: 
v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB16_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: 
v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 
v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, 
v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8190,85 +8421,121 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v44f16_to_v22f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 
16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 
op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB17_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 
op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v22f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: 
s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -9054,139 +9321,181 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] 
; -; GFX11-LABEL: bitcast_v11i64_to_v44i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 
v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB20_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB20_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, 
v28, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v11i64_to_v44i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v11i64_to_v44i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; 
GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -10023,85 +10332,121 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v44i16_to_v11i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 
-; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB21_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v11i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, 
v38, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, 
v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -10833,139 +11178,181 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v11i64_to_v44f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; 
GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB22_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: 
v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB22_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v11i64_to_v44f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v11i64_to_v44f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, 
vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 
v2, v49, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11896,85 +12283,121 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v44f16_to_v11i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 
v33, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 
op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB23_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v11i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq 
i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -12449,122 +12872,147 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v11f64_to_v44i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB24_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB24_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v11f64_to_v44i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; 
GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v11f64_to_v44i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 
v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -13401,85 +13849,121 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v44i16_to_v11f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v51, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB25_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v11f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -14167,122 +14651,147 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v11f64_to_v44f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 
-; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: ; implicit-def: $vgpr23 -; GFX11-NEXT: ; implicit-def: $vgpr22 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB26_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 
1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-NEXT: .LBB26_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v11f64_to_v44f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-FAKE16-LABEL: bitcast_v11f64_to_v44f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -15213,85 +15722,121 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v44f16_to_v11f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v11, v33, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 
0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB27_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v11f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, 
v39, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB27_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -16159,129 +16704,165 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v51, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v44i16_to_v44f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 
v28, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v50, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v48, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v38, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v36, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v34, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v32, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v30, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v28, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v23, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v24, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v25, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v26, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v27, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, 
v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GFX11-NEXT: .LBB28_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v23, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 
v1, v24, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v25, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v26, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v27, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v28, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v30, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v32, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v34, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v36, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v38, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v48, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v50, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v44f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v44f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v50, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v48, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v38, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v36, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v34, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v32, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v30, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v28, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v24, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v25, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v26, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v27, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 
3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v24, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v25, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v26, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v27, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v28, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v30, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v32, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v34, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v36, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v38, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v48, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v50, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -16947,129 +17528,165 @@ define <44 x i16> 
@bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v51, v21, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v44f16_to_v44i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v50, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v48, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v38, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v36, v13, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v34, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v32, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v30, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v28, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v23, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v24, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v25, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v26, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v27, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v24, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GFX11-NEXT: .LBB29_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v23, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v24, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v25, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v26, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v27, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v28, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v30, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v32, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v34, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v36, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v38, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v48, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, 
v50, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v44i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 
0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v44i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v19, v50, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v48, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v38, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v36, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v34, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v32, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v30, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v28, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v24, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v25, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v26, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v27, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v24, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v25, v2, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v3, v26, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v27, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v28, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v30, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v32, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v34, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v36, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v38, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v48, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v50, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 87fa5af74c596..b1a194f8a3a7d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck 
-check-prefixes=GFX11,GFX11-FAKE16 %s define <24 x float> @bitcast_v24i32_to_v24f32(<24 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v24i32_to_v24f32: @@ -1508,143 +1509,181 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v24i32_to_v48i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB6_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 
16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB6_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, 
v31, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v24i32_to_v48i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v24i32_to_v48i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2579,91 +2618,129 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v48i16_to_v24i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v32, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v25, v23, 
0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: 
s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v24i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, 
v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3505,143 +3582,181 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v24i32_to_v48f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: 
$vgpr24 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB8_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; 
GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB8_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v51, v4, 
0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v24i32_to_v48f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v24i32_to_v48f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4696,91 +4811,129 
@@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v48f16_to_v24i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 
0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 
v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v24i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5957,131 +6110,157 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v24f32_to_v48i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; 
GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB14_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB14_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v24f32_to_v48i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; 
GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v24f32_to_v48i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 
16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v21, v26, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7016,91 +7195,129 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v48i16_to_v24f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v6, v50, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB15_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: 
v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v24f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7942,131 +8159,157 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v24f32_to_v48f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: ; 
implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB16_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 
16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB16_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v24f32_to_v48f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v24f32_to_v48f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -9121,91 +9364,129 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v48f16_to_v24f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v51, 
v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: 
v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB17_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, 
v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v24f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; 
GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -10074,149 +10355,193 @@ define <48 x i16> 
@bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v12i64_to_v48i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB20_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB20_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v4, v51, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v12i64_to_v48i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v12i64_to_v48i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 
16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, 
null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; 
GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11151,91 +11476,129 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v48i16_to_v12i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22 
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 
-; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB21_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; 
GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB21_2: ; 
%end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v12i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -12077,149 +12440,193 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v12i64_to_v48f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; 
implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB22_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 
v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB22_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 
-; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v12i64_to_v48f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v12i64_to_v48f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: 
s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, 
v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -13274,91 +13681,129 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v48f16_to_v12i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: 
v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB23_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v12i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 
v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -13901,131 +14346,157 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v12f64_to_v48i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB24_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB24_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v12f64_to_v48i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v12f64_to_v48i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -14960,91 
+15431,129 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v48i16_to_v12f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v37, v11, 
0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, 
v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB25_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v12f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] 
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -15838,131 +16347,157 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v12f64_to_v48f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: 
; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: ; implicit-def: $vgpr25 -; GFX11-NEXT: ; implicit-def: $vgpr24 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB26_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; 
GFX11-NEXT: s_cbranch_execz .LBB26_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-NEXT: .LBB26_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v12f64_to_v48f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], 
v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v12f64_to_v48f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 
16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -17017,91 +17552,129 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v48f16_to_v12f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 
v48, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: 
v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB27_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] 
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v12f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 
v15, v33, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB27_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -18080,139 +18653,177 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v23, v55, v23, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v48i16_to_v48f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 
v25, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v55, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v54, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v53, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v52, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v51, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v49, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v39, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v37, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v35, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v33, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v31, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v25, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v26, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v27, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v28, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v29, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v23 -; GFX11-NEXT: .LBB28_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v25, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v26, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v27, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v28, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v29, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v31, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v33, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v35, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v37, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v39, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v49, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v51, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v52, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v53, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v54, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v55, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v48f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v48f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v55, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v54, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v53, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v52, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v51, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v49, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v39, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v37, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v35, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v8, v33, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v31, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v25, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v26, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v27, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v28, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v29, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v23 +; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v25, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v26, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v27, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v28, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v29, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v31, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v33, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v35, v10, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v37, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v39, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v49, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v51, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v52, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v53, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v54, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v55, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -18944,139 +19555,177 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v23, v55, v23, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v48f16_to_v48i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 
16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v55, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v54, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v53, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v52, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v51, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v49, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v39, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v37, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v35, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v33, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v31, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v25, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v26, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v27, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v28, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v29, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 
0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 
v51, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v23 -; GFX11-NEXT: .LBB29_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v25, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v26, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v27, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v28, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v29, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v31, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v33, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v35, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v37, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v39, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v49, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v51, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v52, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v53, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v54, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v55, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v48i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; 
GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v48i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 
16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v55, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v54, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v53, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v52, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v51, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v49, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 
v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v39, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v37, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v35, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v33, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v31, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v25, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v26, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v27, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v28, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v29, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v23 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v25, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v26, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v27, v2, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v3, v28, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v29, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v31, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v33, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v35, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v37, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v39, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v49, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v51, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v52, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v53, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v54, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v55, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index e4f8a96c482c6..75baa36ca3d11 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck 
-check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <26 x float> @bitcast_v26i32_to_v26f32(<26 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v26i32_to_v26f32: @@ -1610,153 +1611,193 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v26i32_to_v52i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB6_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: 
v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB6_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, 
v52, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v26i32_to_v52i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, 
v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v26i32_to_v52i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; 
GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v18, v33, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2792,97 +2833,137 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v52i16_to_v26i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, 
v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v26i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, 
v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3823,153 +3904,193 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v26i32_to_v52f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: 
$vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB8_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; 
GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB8_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v26i32_to_v52f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, 
v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v26i32_to_v52f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 
16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, 
v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5169,97 +5290,137 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v52f16_to_v26i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v50, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, 
exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: 
v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v26i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v5, v55, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6521,140 +6682,167 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v26f32_to_v52i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; 
GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB14_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB14_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v26, v25, 
0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v26f32_to_v52i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v26f32_to_v52i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v8, v51, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7690,97 +7878,137 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v52i16_to_v26f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 
v39, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo 
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB15_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; 
GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v26f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8721,140 +8949,167 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v26f32_to_v52f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; 
GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB16_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB16_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v26f32_to_v52f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v26f32_to_v52f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 
16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 
v11, v48, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -10054,97 +10309,137 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v52f16_to_v26f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v51, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB17_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; 
GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB17_2: ; 
%end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v26f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11084,160 +11379,207 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v13i64_to_v52i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: 
; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB20_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB20_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v13i64_to_v52i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v13i64_to_v52i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 
16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; 
GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -12273,97 +12615,137 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v52i16_to_v13i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 
v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB21_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v13i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v19, v33, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 
v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -13304,160 +13686,207 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v13i64_to_v52f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB22_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; 
GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB22_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v13i64_to_v52f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: 
v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v13i64_to_v52f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow 
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, 
v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -14657,97 +15086,137 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v52f16_to_v13i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v31, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v19, v33, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] 
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB23_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 
0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v13i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; 
GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -15345,140 +15814,167 @@ 
define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v13f64_to_v52i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB24_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, 
v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB24_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 
-; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v13f64_to_v52i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v13f64_to_v52i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -16514,97 +17010,137 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v52i16_to_v13f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2 
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, 
v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB25_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: 
v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v13f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, 
v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -17493,140 +18029,167 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v13f64_to_v52f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: ; 
implicit-def: $vgpr27 -; GFX11-NEXT: ; implicit-def: $vgpr26 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB26_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], 
v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-NEXT: .LBB26_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v8, v51, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v13f64_to_v52f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 
v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v13f64_to_v52f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, 
v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 
1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -18826,97 +19389,137 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v52f16_to_v13f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v35, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v30, v22, 
0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 
op_sel_hi:[0,1] -; GFX11-NEXT: .LBB27_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v13f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB27_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -20030,149 +20633,189 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
bitcast_v52i16_to_v52f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v26 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_perm_b32 v25, v68, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v67, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v66, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v64, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v55, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v54, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v53, v18, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v17, v52, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v50, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v48, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v38, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v36, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v34, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v32, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v27, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v28, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v29, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v30, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; 
GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v25 -; GFX11-NEXT: .LBB28_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v27, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v28, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v29, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v30, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v32, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v34, v7, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v36, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v38, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v48, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v50, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v52, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v53, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v54, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v55, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v64, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v66, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v67, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v68, v25, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v52f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: 
v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v52f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v68, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v67, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v66, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v64, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v55, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v54, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v53, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v52, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v50, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v48, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v38, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v36, v9, 0x5040100 
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v34, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v32, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v27, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v28, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v29, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v30, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] 
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25 +; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v27, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v28, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v29, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v30, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v32, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v7, v34, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v36, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v38, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v48, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v50, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v52, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v53, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v54, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v55, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v64, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v66, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v67, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v68, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -20999,149 +21642,189 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v52f16_to_v52i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v26 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_perm_b32 v25, v68, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v67, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v66, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v64, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v55, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v54, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v53, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v52, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v50, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v48, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v38, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v36, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v34, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, 
v32, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v27, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v28, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v29, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v30, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v25 -; GFX11-NEXT: .LBB29_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v27, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v28, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v29, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v30, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v32, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v34, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v36, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v38, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v48, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v50, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v52, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, 
v53, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v54, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v55, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v64, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v66, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v67, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v68, v25, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v52i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 
0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v52i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v68, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v67, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v66, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v64, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v55, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v54, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v53, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v52, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v50, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v48, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v38, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v36, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v34, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v32, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v27, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v28, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v29, v2, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v3, v30, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v30, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v27, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v28, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v29, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v30, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v32, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v34, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v36, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v38, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, 
v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v48, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v50, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v52, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v53, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v54, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v55, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v64, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v66, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v67, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v68, v25, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index d1531b389ac42..cdbe26b309831 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <28 x float> @bitcast_v28i32_to_v28f32(<28 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v28i32_to_v28f32: @@ -1716,163 +1717,205 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
bitcast_v28i32_to_v56i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, 
v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB6_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: 
v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB6_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v28i32_to_v56i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v28i32_to_v56i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3010,103 +3053,145 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v56i16_to_v28i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17 
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 
v23, v33, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v26, v26, 
3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 
v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, 
exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4127,163 +4212,205 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v28i32_to_v56f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB8_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: 
v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB8_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v28i32_to_v56f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v23, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v28i32_to_v56f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, 
v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5604,103 +5731,145 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v56f16_to_v28i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 
v17, v39, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 
op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] 
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 
16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7051,149 +7220,177 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v28f32_to_v56i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; 
implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB14_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-NEXT: 
v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB14_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: 
v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8331,103 +8528,145 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v56i16_to_v28f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, 
v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB15_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -9448,149 +9687,177 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v28f32_to_v56f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: 
v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB16_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: 
v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB16_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 
0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; 
GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 
16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -10911,103 +11178,145 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v56f16_to_v28f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 
v51, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v30, v26, 
0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v27, 0x200, 
v27 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB17_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 
16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -12022,170 +12331,219 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v14i64_to_v56i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 
v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB20_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, 
vcc_lo, v18, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB20_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 
v21, v34, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; 
GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -13323,103 +13681,145 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v56i16_to_v14i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB21_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -14440,170 +14840,219 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v14i64_to_v56f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; 
implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 
v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB22_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; 
GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB22_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: ; 
%bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; 
GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: 
v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: 
v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -15924,103 +16373,145 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v56f16_to_v14i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: 
v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v51, v13, 
0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 
0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB23_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 
v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -16675,149 +17166,177 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v14f64_to_v56i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; 
GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB24_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_4 -; 
GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: 
.LBB24_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: 
s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -17955,103 +18474,145 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v56i16_to_v14f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, 
v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB25_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -19016,149 +19577,177 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v14f64_to_v56f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: 
; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: ; implicit-def: $vgpr29 -; GFX11-NEXT: ; implicit-def: $vgpr28 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 
v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB26_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-NEXT: .LBB26_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v29, v26, 
0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 
16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 
16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -20479,103 +21068,145 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v56f16_to_v14f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 
op_sel_hi:[0,1] -; GFX11-NEXT: .LBB27_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 
16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB27_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -21817,159 +22448,201 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v56i16_to_v56f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_perm_b32 v27, v80, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v71, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v70, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v69, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v68, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v67, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v66, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v65, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v64, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v55, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v54, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v53, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v51, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v49, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v39, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v35, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v29, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v30, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v31, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v33, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] 
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 
v64, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v27 -; GFX11-NEXT: .LBB28_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v29, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v30, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v31, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v33, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v35, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v39, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v49, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v51, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v53, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v54, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v55, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v64, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v65, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v66, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v67, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v68, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v69, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v70, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v71, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v80, v27, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v56f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] 
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v56f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, 
exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v80, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v71, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v70, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v69, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v68, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v67, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v66, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v64, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v55, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v54, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v53, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v51, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v49, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v39, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v35, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v29, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v30, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v31, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v33, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v27 +; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v29, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v30, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v31, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v33, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v35, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v39, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v49, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v51, v14, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v53, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v54, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v55, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v64, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v66, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v67, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v68, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v69, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v70, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v71, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v80, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -22938,159 +23611,201 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v56f16_to_v56i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_perm_b32 v27, v80, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v71, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v70, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v69, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v68, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v67, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v66, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v65, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v64, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v55, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v54, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v53, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v51, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v49, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v39, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v35, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v29, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v30, v1, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v2, v31, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v33, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v33, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v27 -; GFX11-NEXT: .LBB29_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v29, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v30, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v31, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v33, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v35, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v39, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v49, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v51, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v53, v16, 
0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v54, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v55, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v64, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v65, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v66, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v67, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v68, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v69, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v70, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v71, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v80, v27, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v56i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, 
v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v56i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 
16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v80, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v71, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v70, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v69, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v68, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v67, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v66, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v64, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v55, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v54, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v53, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v51, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v49, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, 
v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v39, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v35, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v29, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v30, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v31, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v33, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: 
v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v27 +; GFX11-FAKE16-NEXT: 
.LBB29_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v29, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v30, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v31, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v33, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v35, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v39, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v49, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v51, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v53, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v54, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v55, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v64, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v66, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v67, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v68, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v69, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v70, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v71, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v80, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index b60649cc23590..2837f2b2bd7fa 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll 
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <30 x float> @bitcast_v30i32_to_v30f32(<30 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v30i32_to_v30f32: @@ -1820,173 +1821,217 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v30i32_to_v60i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: ; implicit-def: $vgpr83 -; GFX11-NEXT: ; implicit-def: $vgpr82 -; GFX11-NEXT: ; implicit-def: $vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr80 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; 
GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB6_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 
v29, 3, v29 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, 
v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB6_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v36, v23, 
0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: 
s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, 
v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, 
v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3218,109 +3263,153 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v60i16_to_v30i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 
v50, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: 
v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4470,173 +4559,217 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v30i32_to_v60f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: ; implicit-def: $vgpr83 -; GFX11-NEXT: ; implicit-def: $vgpr82 -; GFX11-NEXT: ; implicit-def: $vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr80 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; 
implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB8_2: ; %Flow -; 
GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB8_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, 
v39, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 
v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, 
v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, 
v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6060,109 +6193,153 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v60f16_to_v30i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: 
v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 
op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7600,158 +7777,187 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v30f32_to_v60i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: ; implicit-def: $vgpr83 -; GFX11-NEXT: ; implicit-def: $vgpr82 -; GFX11-NEXT: ; implicit-def: $vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr80 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: 
$vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB14_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 -; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB14_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; 
GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, 
v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8983,109 +9189,153 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v60i16_to_v30f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17 
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 
v23, v37, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB15_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] 
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -10235,158 +10485,187 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v30f32_to_v60f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: ; implicit-def: $vgpr83 -; GFX11-NEXT: ; implicit-def: $vgpr82 -; GFX11-NEXT: ; implicit-def: $vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr80 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; 
implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB16_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; 
GFX11-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 -; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB16_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; 
GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 
16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: 
v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11810,109 +12089,153 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v60f16_to_v30f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 
16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB17_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] 
+; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: 
v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -13000,181 +13323,233 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v15i64_to_v60i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: ; implicit-def: $vgpr83 -; GFX11-NEXT: ; implicit-def: $vgpr82 -; GFX11-NEXT: ; implicit-def: $vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr80 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; 
GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB20_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: 
v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB20_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 
v5, v70, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-TRUE16-NEXT: 
v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 
0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; 
GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, 
vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, 
v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -14406,109 +14781,153 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v60i16_to_v15i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, 
v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB21_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
bitcast_v60i16_to_v15i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -15658,181 +16077,233 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v15i64_to_v60f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: ; implicit-def: $vgpr83 -; GFX11-NEXT: ; implicit-def: $vgpr82 -; GFX11-NEXT: ; implicit-def: $vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr80 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB22_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 
-; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB22_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, 
v39, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 
0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 
3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -17256,109 +17727,153 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v60f16_to_v15i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 
v17, v51, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; 
GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB23_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 
0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -18074,158 +18589,187 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v15f64_to_v60i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: ; implicit-def: $vgpr83 -; GFX11-NEXT: ; implicit-def: $vgpr82 -; GFX11-NEXT: ; implicit-def: $vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr80 -; GFX11-NEXT: ; implicit-def: 
$vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB24_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB24_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
bitcast_v15f64_to_v60i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 
16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], 
v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB24_4: ; 
%end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -19457,109 
+20001,153 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v60i16_to_v15f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-NEXT: 
v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: 
v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB25_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, 
v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: 
v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -20639,158 +21227,187 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v15f64_to_v60f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: ; implicit-def: $vgpr83 -; GFX11-NEXT: ; implicit-def: $vgpr82 -; GFX11-NEXT: ; implicit-def: $vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr80 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr67 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr55 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr39 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: ; implicit-def: $vgpr32 -; GFX11-NEXT: ; implicit-def: $vgpr31 -; GFX11-NEXT: ; implicit-def: $vgpr30 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 
v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB26_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-NEXT: .LBB26_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; 
GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; 
%cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -22214,109 +22831,153 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v60f16_to_v15f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v82, v1, 
0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 
op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB27_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 
s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, 
v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: 
s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB27_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -23682,169 +24343,213 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v60i16_to_v60f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v82, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v81, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v80, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v71, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v69, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v68, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v67, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v65, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v64, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v55, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v54, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v52, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v50, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v36, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v34, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v52, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29 -; GFX11-NEXT: .LBB28_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v34, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v36, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v50, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v52, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v54, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v55, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v64, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v65, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v67, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v68, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v69, v22, 0x5040100 -; 
GFX11-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v71, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v80, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v81, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v82, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v60f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] 
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v60f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v82, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v81, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v80, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v71, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v69, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v68, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v67, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v65, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v64, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v55, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v54, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v52, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v11, v50, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v36, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v34, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v26 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v82, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29 +; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v34, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v36, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v50, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v52, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v54, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v55, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v64, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v65, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v67, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v68, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v69, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v71, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v80, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v81, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v82, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = 
icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -24848,169 +25553,213 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v60f16_to_v60i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 -; GFX11-NEXT: ; %bb.1: ; 
%cmp.true -; GFX11-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v82, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v81, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v80, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v71, v24, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v69, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v68, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v67, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v65, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v64, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v55, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v54, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v52, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v50, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v36, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v34, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] 
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v54, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29 -; GFX11-NEXT: .LBB29_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v34, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v36, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 -; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 -; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-NEXT: v_perm_b32 v11, v50, v11, 0x5040100 -; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-NEXT: v_perm_b32 v13, v52, v13, 0x5040100 -; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-NEXT: v_perm_b32 v15, v54, v15, 0x5040100 -; GFX11-NEXT: v_perm_b32 v16, v55, v16, 0x5040100 -; GFX11-NEXT: v_perm_b32 v17, v64, v17, 0x5040100 -; GFX11-NEXT: v_perm_b32 v18, v65, v18, 0x5040100 -; GFX11-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 -; GFX11-NEXT: v_perm_b32 v20, v67, v20, 0x5040100 -; GFX11-NEXT: v_perm_b32 v21, v68, v21, 0x5040100 -; GFX11-NEXT: v_perm_b32 v22, v69, v22, 0x5040100 -; GFX11-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 -; GFX11-NEXT: v_perm_b32 v24, v71, v24, 
0x5040100 -; GFX11-NEXT: v_perm_b32 v25, v80, v25, 0x5040100 -; GFX11-NEXT: v_perm_b32 v26, v81, v26, 0x5040100 -; GFX11-NEXT: v_perm_b32 v27, v82, v27, 0x5040100 -; GFX11-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 -; GFX11-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v60i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v60i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v82, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v81, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v80, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v71, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v69, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v68, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v67, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v65, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v64, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v55, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v54, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v52, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 
v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v50, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v36, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v34, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: 
v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v71, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v34, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v36, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v50, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v52, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v54, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v55, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v64, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v65, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v67, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v68, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v69, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v71, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v80, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v81, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v82, v27, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v28, v83, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index e7f48435f0ad2..198bf839cb1cb 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -3414,54 +3416,103 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; -------------------------------------------------------------------- define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg 
%ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v5, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; 
GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv 
scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -3497,47 +3548,89 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .LBB13_1: ; 
%atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -3764,53 +3857,101 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr } define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz 
.LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -3845,46 +3986,87 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: v_mov_b32_e32 v3, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner 
Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v1, v1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; 
GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 
0xffff, s4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -4103,86 +4285,167 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ } define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { -; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX12-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v11, v7 -; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7 -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 -; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX12-NEXT: v_mov_b32_e32 v7, v8 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB15_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: 
v_add_nc_u32_e32 v6, 0x200, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7 +; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-TRUE16-NEXT: ; %bb.2: +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 +; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v11, v7 +; GFX12-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-FAKE16-NEXT: ; %bb.2: +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_3 +; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -4251,82 +4514,159 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX11-NEXT: v_and_b32_e32 v10, -4, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v11, v7 -; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 
v4, v7 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v6, v6, v5 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6 -; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 -; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB15_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7 +; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-TRUE16-NEXT: ; %bb.2: +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 +; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v11, v7 +; GFX11-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-FAKE16-NEXT: ; %bb.2: +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_3 +; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -4745,64 +5085,124 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; -------------------------------------------------------------------- define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-TRUE16-NEXT: s_and_b32 
s4, s16, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 
v3, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -4846,57 +5246,110 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; 
GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz 
.LBB16_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -5151,63 +5604,122 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine } define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: 
s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -5250,56 +5762,108 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; 
GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, 
vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -5518,125 +6082,218 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: 
v_mov_b32_e32 v1, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB17_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 - %unused = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; 
GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: 
buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB18_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB17_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, 
bfloat %val) #0 { +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-TRUE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-TRUE16-NEXT: ; %bb.2: +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX12-TRUE16-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: ; Child Loop BB18_4 Depth 2 +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, 
s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_3 +; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-FAKE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 
s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-FAKE16-NEXT: ; %bb.2: +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_3 +; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -5713,94 +6370,184 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v4, v4, v10 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB18_3 -; GFX11-NEXT: ; 
%bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-TRUE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-TRUE16-NEXT: ; %bb.2: +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: 
.LBB18_3: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-FAKE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 
s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-FAKE16-NEXT: ; %bb.2: +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB18_3: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: ; Child Loop BB18_4 Depth 2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 +; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8338,58 +9085,113 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: 
v_add3_u32 v5, v5, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8721,54 +9523,105 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9149,91 +10002,176 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB28_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 -; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX11-NEXT: v_mov_b32_e32 
v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB28_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; 
GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-TRUE16-NEXT: ; %bb.2: +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX11-TRUE16-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v8 :: v_dual_add_f32 v4, v4, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_4 +; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_3 +; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v0, v4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-FAKE16-NEXT: ; %bb.2: +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB28_3: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: ; Child Loop BB28_4 Depth 2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 +; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_4 +; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 +; 
GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_3 +; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9755,58 +10693,113 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: 
buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, 
v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset: ; GFX10: ; %bb.0: @@ -10138,54 +11131,105 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: 
v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: 
s_cbranch_execnz .LBB30_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset: ; GFX10: ; %bb.0: @@ -10514,58 +11558,113 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen 
glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: @@ -10897,54 +11996,105 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: 
s_or_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, 
v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: @@ -11272,54 +12422,105 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; 
GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index b0447194412d8..bee2813ca30f0 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck 
-check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -2482,56 +2484,107 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; -------------------------------------------------------------------- define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: 
s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; 
GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v0, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -2569,50 +2622,95 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_max_f16_e32 v0, v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 
s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_max_f16_e32 v0.h, v1.l, v1.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; 
GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -2847,55 +2945,105 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr } define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 -; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: 
v_dual_mov_b32 v4, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -2932,49 +3080,93 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 
3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v3, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: +; 
GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -3201,89 +3393,172 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ } define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz 
.LBB12_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, 
vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB12_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX12-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], null offen +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 +; 
GFX12-TRUE16-NEXT: ; %bb.2: +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX12-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 +; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 +; GFX12-TRUE16-NEXT: ; %bb.5: ; 
in Loop: Header=BB12_3 Depth=1 +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 +; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; 
GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-FAKE16-NEXT: ; %bb.2: +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v10, v5, v5 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v10 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: 
s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_4 +; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_3 +; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -3354,85 +3629,164 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; 
GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-NEXT: v_max_f16_e32 v4, v4, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB12_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX11-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; 
GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-TRUE16-NEXT: ; %bb.2: +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; 
GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 +; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 +; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-FAKE16-NEXT: ; %bb.2: +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v10, v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_4 +; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_3 +; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -3859,64 +4213,124 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; 
-------------------------------------------------------------------- define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; 
GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v0, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; 
GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: 
s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -3960,57 +4374,110 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX11-NEXT: ; 
=>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, 
v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -4267,63 +4734,122 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine } define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: 
v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v0, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: 
s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: 
s_cbranch_execnz .LBB14_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -4366,56 +4892,108 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 
0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; 
GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv 
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -4663,98 +5241,191 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ret void } -define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, 
vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; 
GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB15_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-TRUE16-NEXT: 
v_not_b32_e32 v9, v6 +; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-TRUE16-NEXT: ; %bb.2: +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v10 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 +; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-FAKE16-NEXT: ; %bb.2: +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v10 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_xor_b32 
exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_3 +; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -4831,94 +5502,184 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: 
v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_max_f32_e32 v4, v4, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: 
v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB15_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-TRUE16-NEXT: ; %bb.2: +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 +; GFX11-TRUE16-NEXT: ; 
%bb.6: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-FAKE16-NEXT: ; %bb.2: +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 
6 +; GFX11-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_3 +; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6614,61 +7375,120 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; -------------------------------------------------------------------- define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: 
v_mov_b32_e32 v4, s4 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; 
GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; 
GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: 
v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv 
scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6715,58 +7535,113 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: 
v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7039,56 +7914,109 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu } define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; 
GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v5 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 -; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_max_num_f32 v0, v0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: 
s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-FAKE16-NEXT: 
s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7134,54 +8062,105 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] 
; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_max_f32 v0, v0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7450,95 +8429,186 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi } define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8 -; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; 
GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB21_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-TRUE16-NEXT: ; %bb.2: +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX12-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2 
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v8 :: v_dual_max_num_f32 v4, v4, v9 +; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: 
s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_3 +; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; 
GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-FAKE16-NEXT: ; %bb.2: +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8 +; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: 
v_mov_b32_e32 v4, v5 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_3 +; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7618,91 +8688,176 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v5, v5, v9 :: v_dual_max_f32 v4, v4, v8 -; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v13, 
0x400000, v5 -; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB21_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v4 -; GFX11-NEXT: s_setpc_b64 
s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-TRUE16-NEXT: ; %bb.2: +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX11-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v8 :: v_dual_max_f32 v4, v4, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-FAKE16-NEXT: ; %bb.2: +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: 
s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v9 :: v_dual_max_f32 v4, v4, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 
vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index e33c8aa30391d..1826743ed017d 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck 
-check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -2482,56 +2484,107 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; -------------------------------------------------------------------- define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: 
s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: 
v_mov_b32_e32 v5, s4 +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v0, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -2569,50 +2622,95 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_min_f16_e32 v0, v0, v5 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; 
GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v0, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 
0xffff, s4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -2847,55 +2945,105 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr } define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 -; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 
s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 
s4, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -2932,49 +3080,93 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: 
s_and_b32 s4, s16, -4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v1, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -3201,89 +3393,172 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ } define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; 
GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 -; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB12_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX12-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], null offen +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-TRUE16-NEXT: ; %bb.2: +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX12-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 +; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, 
s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 +; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 +; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 
s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-FAKE16-NEXT: ; %bb.2: +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v10, v5, v5 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, v4, v10 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; 
GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_4 +; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_3 +; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -3354,85 +3629,164 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f16_e32 v10, v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-NEXT: v_min_f16_e32 v4, v4, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: 
v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB12_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, 
v9, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX11-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-TRUE16-NEXT: ; %bb.2: +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop 
BB12_3 Depth=1 +; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 +; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 +; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-FAKE16-NEXT: ; %bb.2: +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v10, v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, v4, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 +; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_4 +; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_3 +; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: 
; %bb.0: @@ -3859,64 +4213,124 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; -------------------------------------------------------------------- define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, 
v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v0, v0, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -3960,57 +4374,110 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; 
GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; 
GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -4267,63 +4734,122 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine } define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: 
s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_b32 s4, s16, -4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v2, s4 -; GFX12-NEXT: s_and_b32 s4, s16, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s4, s4, 3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen -; GFX12-NEXT: s_not_b32 s6, s5 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, 
v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v4 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v0, v0, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v0, v0 +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-FAKE16-NEXT: 
s_and_b32 s4, s16, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-FAKE16-NEXT: s_or_b32 s5, 
vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -4366,56 +4892,108 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_addk_i32 s16, 0x200 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_and_b32 s4, s16, -4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: s_and_b32 s4, s16, 3 -; GFX11-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen -; GFX11-NEXT: s_not_b32 s6, s5 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: 
v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-FAKE16-NEXT: s_not_b32 s6, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 
0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -4663,98 +5241,191 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ret void } -define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-NEXT: v_not_b32_e32 v9, v6 -; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; 
GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; 
GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v4 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB15_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-TRUE16-NEXT: ; %bb.2: +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v10 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 +; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-FAKE16-NEXT: ; %bb.2: +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start +; 
GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v10 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 
0x0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_4 +; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_3 +; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -4831,94 +5502,184 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-NEXT: v_not_b32_e32 v9, v6 -; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: 
v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_min_f32_e32 v4, v4, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; 
GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB15_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-TRUE16-NEXT: ; %bb.2: +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 +; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-FAKE16-NEXT: ; %bb.2: +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 +; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: 
v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_3 +; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6614,61 +7375,120 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; -------------------------------------------------------------------- define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 
s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v4, s4 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v0 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v1, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; 
GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 
v0, v0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null 
offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6715,58 +7535,113 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-NEXT: 
v_or_b32_e32 v9, 0x400000, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 -; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 
offen offset:1024 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: 
buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7039,56 +7914,109 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu } define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 -; GFX12-NEXT: s_mov_b32 s5, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v5 -; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_min_num_f32 v0, v0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 
v0, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 
0xffff0000, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7134,54 +8062,105 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: s_mov_b32 s5, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc 
-; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 -; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_min_f32 v0, v0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, 
v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: 
.p2align 6 +; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 
exec_lo, exec_lo, s5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7450,95 +8429,186 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi } define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 -; GFX12-NEXT: s_mov_b32 s1, exec_lo -; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 -; GFX12-NEXT: ; implicit-def: $vgpr4 -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Loop Header: Depth=1 -; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; 
GFX12-NEXT: s_mov_b32 s2, exec_lo -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8 -; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 -; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v1 -; GFX12-NEXT: v_readfirstlane_b32 s6, v2 -; GFX12-NEXT: v_readfirstlane_b32 s7, v3 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_saveexec_b32 s0, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_4 -; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX12-NEXT: s_mov_b32 exec_lo, s2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-NEXT: 
v_mov_b32_e32 v6, v4 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB21_3 -; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-TRUE16-NEXT: ; %bb.2: +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX12-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Loop 
Header: Depth=1 +; GFX12-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v8 :: v_dual_min_num_f32 v4, v4, v9 +; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; 
GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_3 +; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GFX12-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, 
vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 +; GFX12-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-FAKE16-NEXT: ; %bb.2: +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8 +; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_3 +; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7618,91 +8688,176 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 -; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 -; GFX11-NEXT: ; %bb.2: -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Loop Header: Depth=1 -; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX11-NEXT: s_mov_b32 s2, exec_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8 -; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_or_b32_e32 v13, 
0x400000, v5 -; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX11-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 -; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-NEXT: v_readfirstlane_b32 s5, v1 -; GFX11-NEXT: v_readfirstlane_b32 s6, v2 -; GFX11-NEXT: v_readfirstlane_b32 s7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc -; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_4 -; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB21_3 -; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v4 -; GFX11-NEXT: s_setpc_b64 
s[30:31] +; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-TRUE16-NEXT: ; %bb.2: +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX11-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v8 :: v_dual_min_f32 v4, v4, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-FAKE16-NEXT: ; %bb.2: +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: 
s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 +; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 
vcc_lo, s[4:5], v[0:1] +; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] +; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir index f9db082a2e912..9b6a2f3a1aa1e 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir @@ -57,6 +57,57 @@ body: | %4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec ... 
+--- +name: salu16_usedby_salu32 +body: | + bb.0: + ; GCN-LABEL: name: salu16_usedby_salu32 + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[DEF]].lo16, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_TRUNC_F16_t16_e64_]], %subreg.lo16, [[DEF2]], %subreg.hi16 + ; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[REG_SEQUENCE]], [[DEF]], implicit $exec + %0:vgpr_32 = IMPLICIT_DEF + %1:sreg_32 = COPY %0:vgpr_32 + %2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode + %3:sreg_32 = S_XOR_B32 %2:sreg_32, %1:sreg_32, implicit-def $scc +... + +--- +name: salu32_usedby_salu16 +body: | + bb.0: + ; GCN-LABEL: name: salu32_usedby_salu16 + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[DEF]], [[DEF]], implicit $exec + ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[V_XOR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec + %0:vgpr_32 = IMPLICIT_DEF + %1:sreg_32 = COPY %0:vgpr_32 + %2:sreg_32 = S_XOR_B32 %1:sreg_32, %1:sreg_32, implicit-def $scc + %3:sreg_32 = S_TRUNC_F16 %2:sreg_32, implicit $mode +... 
+ +--- +name: S_FMAC_F16 +body: | + bb.0: + ; GCN-LABEL: name: S_FMAC_F16 + ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sgpr_lo16 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF2]], %subreg.hi16 + ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF3]], %subreg.hi16 + ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec + %0:vgpr_16 = IMPLICIT_DEF + %1:sgpr_lo16 = COPY %0:vgpr_16 + %2:sreg_32 = COPY %0:vgpr_16 + %3:sreg_32 = COPY %1:sgpr_lo16 + %4:sreg_32 = S_FMAC_F16 %3:sreg_32, %3:sreg_32, %2:sreg_32, implicit $mode +... + --- name: vgpr16_to_spgr32 body: | diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index fdc15a301164a..e13c895a1cc85 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc 
-mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -8147,50 +8149,95 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; -------------------------------------------------------------------- define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; 
GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 
v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8225,45 +8272,85 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 
3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8436,51 +8523,97 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % } define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: 
s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: 
s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8517,46 +8650,87 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: 
; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8735,51 +8909,97 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain } define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 
-; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: 
s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8817,46 +9037,87 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9035,48 +9296,91 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain } define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; 
GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9110,43 +9414,81 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; 
GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: 
s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9313,49 +9655,93 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr } define void 
@flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 
-; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, 
vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9391,44 +9777,83 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9601,49 +10026,93 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra } define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: 
v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9680,44 +10149,83 @@ define void 
@flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: 
flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 
exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz 
.LBB41_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9890,37 +10398,69 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra } define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] 
offset:2046 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9946,32 +10486,59 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; 
GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10101,43 +10668,77 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, 
!amdgpu.no.fine.grained.memory !0 - ret void -} - -define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] + ret void +} + +define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: 
s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB43_1: ; 
%atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10164,34 +10765,63 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, 
vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ 
-10328,52 +10958,99 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi } define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; 
GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: 
v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10410,46 +11087,87 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 
3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10630,50 +11348,95 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai } 
define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 -; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: 
global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10709,44 +11472,83 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10925,59 +11727,114 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; -------------------------------------------------------------------- define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 
v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 
s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h 
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -11021,54 +11878,104 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; 
GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: 
buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 
0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11268,61 +12175,118 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt } define bfloat 
@flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: 
v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], 
v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -11368,56 +12332,108 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11623,61 +12639,118 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr } define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 
v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; 
GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; 
GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -11724,56 +12797,108 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) 
| instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; 
GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11972,66 +13097,121 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 -1024 - %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret bfloat %result - } - -define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; 
GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr %ptr, i64 -1024 + %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result + } + +define void 
@flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; 
GFX12-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: 
@@ -12076,54 +13256,104 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 
0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 
v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -12323,59 +13553,114 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr } define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; 
GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; 
GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -12421,54 +13706,104 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -12668,49 +14003,94 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr } define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, 
v4 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: 
s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -12747,44 +14127,84 @@ define bfloat 
@flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: 
flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; 
GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -12954,47 +14374,90 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no } define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; 
GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; 
GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, 
vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -13030,42 +14493,80 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 
0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -13232,57 +14733,110 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no } define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] 
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -13325,52 +14879,100 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: 
flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; 
GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -13564,62 +15166,120 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt } define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: 
v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, 
vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -13665,56 +15325,108 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: 
flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -13912,70 +15624,126 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 - %result = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret bfloat %result -} - -define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: 
s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %result = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result +} + +define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: 
v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; 
GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -14020,54 +15788,104 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, 
v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; 
GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -16574,54 +18392,104 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB68_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; 
GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB68_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB68_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 
s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB68_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -16850,54 +18718,104 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB69_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: 
v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB69_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB69_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; 
GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB69_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -17132,59 +19050,113 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 -; GFX11-NEXT: flat_load_b32 v0, v[4:5] -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB70_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB70_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 +; 
GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5] +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; 
GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5] +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v1 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -17422,52 +19394,100 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] 
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB71_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB71_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: 
flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv 
+; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB71_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB71_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -17690,52 +19710,100 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB72_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB72_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; 
GFX11-TRUE16-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: 
s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] 
offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB72_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -17968,57 +20036,110 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v3, v[3:4] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB73_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: 
v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB73_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: 
flat_load_b32 v3, v[3:4] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, 
s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB73_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; 
GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB73_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -18257,54 +20378,104 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB74_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This 
Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB74_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; 
GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB74_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -18539,52 +20710,100 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 
0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB75_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB75_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB75_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB75_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -18816,54 +21035,104 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB76_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB76_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: 
.LBB76_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_1 +; GFX11-TRUE16-NEXT: 
; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB76_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: @@ -19092,52 +21361,100 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB77_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB77_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 
0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB77_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB77_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: @@ -19360,54 +21677,104 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB78_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 
v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB78_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB78_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: @@ -19636,52 +22003,100 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB79_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 
0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB79_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB79_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index b29a5d0920030..d2cbc25bf7e04 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -6019,52 +6021,99 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; -------------------------------------------------------------------- define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 
0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 
0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6101,47 +6150,89 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6322,53 +6413,103 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % } define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: 
; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 
v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; 
GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6407,48 +6548,93 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; 
GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: 
s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; 
GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6635,53 +6821,103 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain } define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: 
v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: 
s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6721,48 +6957,93 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6949,51 +7230,97 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain } define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 
s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: 
v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; 
GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7029,46 +7356,87 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 
v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 
v5, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7243,52 +7611,101 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr } define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB30_1: ; 
%atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: 
s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; 
GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7326,47 +7743,91 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, 
v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 
0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7547,52 +8008,101 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra } define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, 
-4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7631,47 +8141,91 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: 
v_max_f16_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7852,41 +8406,77 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra } define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; 
GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; 
GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7915,36 +8505,67 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 
v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: 
flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8089,40 +8710,75 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi } define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_cbranch_execnz .LBB33_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: 
flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8150,35 +8806,65 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX942-NEXT: 
s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8320,54 +9006,105 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ } define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; 
GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8406,48 +9143,93 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 
v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8636,53 +9418,103 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai } define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: 
flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 -; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: 
s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8720,47 +9552,91 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: 
v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: 
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 +; 
GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8947,59 +9823,114 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; -------------------------------------------------------------------- define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; 
GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: 
flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9043,54 +9974,104 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] 
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-TRUE16-NEXT: ; 
%bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9291,61 +10272,118 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt } define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 
-; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; 
GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9391,56 +10429,108 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; 
GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; 
GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-FAKE16-NEXT: ; %bb.2: 
; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9647,61 +10737,118 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr } define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; 
GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9748,56 +10895,108 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; 
GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; 
GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10004,57 +11203,110 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr } define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; 
%bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; 
GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: 
v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; 
GFX942: ; %bb.0: @@ -10093,56 +11345,104 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_mov_b32_e32 v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB39_1 -; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, 
v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; 
GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10337,59 
+11637,114 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt } define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: 
v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; 
GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10434,54 +11789,104 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; 
GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10682,59 +12087,114 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr } define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; 
GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; 
GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10780,54 +12240,104 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 
v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11028,49 +12538,94 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr } define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; 
%bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB42_1 +; 
GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: 
global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -11107,44 +12662,84 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11315,47 +12910,90 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no } define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: 
.LBB43_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -11391,42 +13029,80 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 
s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] 
offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11572,84 +13248,142 @@ define void 
@flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 - %unused = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: 
v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 
v2, v5, v2 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %unused = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -11695,56 +13429,108 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: 
.LBB44_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; 
GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: 
buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, 
v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11953,60 +13739,116 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g } define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 
0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -12051,54 +13893,104 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 
16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -14234,57 +16126,111 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; -------------------------------------------------------------------- define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 
s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; 
GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -14328,54 +16274,104 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 
0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; 
GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: 
buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -14581,57 +16577,111 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m } define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -14675,54 +16725,104 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: 
v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; 
GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 
v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -14931,57 +17031,111 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no } define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; 
GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, 
v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -15011,80 +17165,134 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 ; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 -; 
GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5 -; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX942-NEXT: s_cbranch_execnz .LBB56_1 -; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 -; GFX11-NEXT: flat_load_b32 v0, v[4:5] -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_or_b32_e32 v10, 
0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: 
s_cbranch_execnz .LBB56_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5] +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5] +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; 
GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; 
GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -15299,55 +17507,107 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no } define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: 
v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; 
GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: 
.LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX942-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -15390,52 +17650,100 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: 
buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -15635,55 +17943,107 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( } define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB58_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; 
GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 
0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: 
v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -15726,52 +18086,100 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: 
s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -15978,55 +18386,107 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ } define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 
16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB59_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: 
v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -16075,57 +18535,110 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v3, v[3:4] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start -; 
GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, 
v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -16340,58 +18853,113 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ } define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 
0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB60_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: 
s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -16435,54 +19003,104 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; 
GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 
v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; 
GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -16693,56 +19311,109 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n } define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB61_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 
0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-FAKE16-NEXT: ; 
%bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -16785,52 +19456,100 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 
0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll 
b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 9b682179aa279..805848fc3e1cc 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -6019,52 +6021,99 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; -------------------------------------------------------------------- define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 
v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; 
GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, 
v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6101,47 +6150,89 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; 
GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6322,53 +6413,103 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % } define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: 
v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: 
; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; 
GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: 
v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6407,48 +6548,93 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6635,53 +6821,103 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain } define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: 
v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6721,48 +6957,93 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, 
v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6949,51 +7230,97 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain } define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; 
GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; 
GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; 
GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7029,46 +7356,87 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7243,52 +7611,101 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr } define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 
0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: 
v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7326,47 +7743,91 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7547,52 +8008,101 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra } define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, 
v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 
s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; 
GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7631,47 +8141,91 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 
v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: 
.LBB31_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7852,41 +8406,77 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra } define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7915,36 +8505,67 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 -; 
GFX11-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; 
GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8089,40 +8710,75 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi } define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; 
GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8150,35 +8806,65 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; 
GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8320,54 +9006,105 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ } define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: 
v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; 
GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; 
GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8406,48 +9143,93 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; 
GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: 
v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8636,53 +9418,103 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai } define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; 
GFX12-TRUE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8720,47 +9552,91 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, 
v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8947,59 +9823,114 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; -------------------------------------------------------------------- define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 
0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 
v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9043,54 +9974,104 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 
v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: 
flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9291,61 +10272,118 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt } define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: 
v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: 
global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9391,56 +10429,108 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; 
GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9647,61 +10737,118 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr } define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: 
flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9748,56 +10895,108 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, 
v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, 
v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10004,57 +11203,110 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr } define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; 
GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz 
.LBB39_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10093,56 +11345,104 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: v_mov_b32_e32 
v5, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB39_1 -; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; 
GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; 
GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10337,59 +11637,114 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt } define void 
@flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; 
GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: 
v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10434,54 +11789,104 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 
1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 
+; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10682,59 +12087,114 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr } define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: 
s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10780,54 +12240,104 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; 
GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; 
%bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11028,49 +12538,94 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr } define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 
0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: 
s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -11107,44 +12662,84 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] 
offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11315,47 +12910,90 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no } define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; 
GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 
16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -11391,42 +13029,80 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB43_1: ; 
%atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 
6 +; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11572,84 +13248,142 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: 
v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB43_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 - %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, 
vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] 
th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -11695,56 +13429,108 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 
s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, 
v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11953,60 +13739,116 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g } define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 
s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -12051,54 +13893,104 
@@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: 
flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -14234,57 +16126,111 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; -------------------------------------------------------------------- define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; 
GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -14328,54 +16274,104 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; 
GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; 
GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -14581,57 +16577,111 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m } define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 
0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: 
flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; 
GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -14675,54 +16725,104 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -14931,57 +17031,111 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no } define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; 
GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-TRUE16-NEXT: v_and_b32_e32 
v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; 
GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -15011,80 +17165,134 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1 ; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5 -; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: 
flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 -; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX942-NEXT: s_cbranch_execnz .LBB56_1 -; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 -; GFX11-NEXT: flat_load_b32 v0, v[4:5] -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v0, 
16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4 +; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] +; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX942-NEXT: buffer_wbl2 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: buffer_inv sc1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_cbranch_execnz .LBB56_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: 
flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5] +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, 
v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5] +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, 
s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -15299,55 +17507,107 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no } define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, 
v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -15390,52 +17650,100 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; 
GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, 
v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -15635,55 +17943,107 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( } define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, 
v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB58_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, 
v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -15726,52 +18086,100 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; 
GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -15978,55 +18386,107 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ } define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB59_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -16075,57 +18535,110 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v3, v[3:4] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; 
GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: 
s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -16340,58 +18853,113 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ } define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB60_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: 
s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -16435,54 +19003,104 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; 
GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; 
GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 
0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -16693,56 
+19311,109 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n } define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB61_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; 
GFX942: ; %bb.0: @@ -16785,52 +19456,100 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 98d7d259562b0..e0138d58963c8 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -5832,50 +5834,95 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; -------------------------------------------------------------------- define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16: ; GFX942: ; %bb.0: @@ -5910,45 +5957,85 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, 
half %val) #0 { ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
flat_agent_atomic_fsub_ret_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16: ; GFX10: ; %bb.0: @@ -6121,51 +6208,97 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { } define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: 
s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: ; GFX942: ; %bb.0: @@ -6202,46 +6335,87 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This 
Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 
v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos: ; GFX10: ; %bb.0: @@ -6420,51 +6594,97 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) } define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: 
v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: ; GFX942: ; %bb.0: @@ -6502,46 +6722,87 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: 
v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg: ; GFX10: ; %bb.0: @@ -6720,48 +6981,91 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) } define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16: ; GFX942: ; %bb.0: @@ -6795,43 +7099,81 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; 
GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 
+; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16: ; GFX10: ; %bb.0: @@ -6998,49 +7340,93 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { } define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: 
v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
flat_agent_atomic_fsub_noret_f16__offset12b_pos: ; GFX942: ; %bb.0: @@ -7076,44 +7462,83 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; 
GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: ; GFX10: ; %bb.0: @@ -7286,49 +7711,93 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val } define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, 
v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: ; GFX942: ; %bb.0: @@ -7358,51 +7827,90 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v5, v4 -; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX942-NEXT: s_cbranch_execnz .LBB27_1 -; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: v_mov_b32_e32 v5, v4 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB27_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: ; GFX10: ; %bb.0: @@ -7575,39 +8083,73 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val } define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f16_e32 v3, v4, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: 
flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: ; GFX942: ; %bb.0: @@ -7634,34 +8176,63 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v3, v4, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 
+; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: ; GFX10: ; %bb.0: @@ -7798,37 +8369,69 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal } define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-NEXT: v_sub_f16_e32 v3, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: ; GFX942: ; %bb.0: @@ -7854,32 +8457,59 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_sub_f16_e32 v3, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_f16_e32 
v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: ; GFX10: ; %bb.0: @@ -8013,52 +8643,99 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h } define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS 
-; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; 
GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: ; GFX942: ; %bb.0: @@ -8095,46 +8772,87 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 
v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 
-4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos: ; GFX10: ; %bb.0: @@ -8315,50 
+9033,95 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) } define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val) #0 { -; GFX12-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz 
.LBB31_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; 
GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: ; GFX942: ; %bb.0: @@ -8394,44 +9157,83 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2 -; 
GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: ; GFX10: ; %bb.0: @@ -8610,59 +9412,114 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; -------------------------------------------------------------------- define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB32_1: 
; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 
0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: 
v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16: ; GFX942: ; %bb.0: @@ -8706,54 +9563,104 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: 
flat_agent_atomic_fsub_ret_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16: ; GFX10: ; %bb.0: @@ -8950,64 +9857,121 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr %ptr, bfloat %val syncscope("agent") seq_cst ret bfloat %result -} - -define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 
0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_cbranch_execnz .LBB33_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +} + +define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 +; 
GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -9053,56 +10017,108 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, 
v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos: ; GFX10: ; %bb.0: @@ -9308,61 +10324,118 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % } define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: 
s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: ; GFX942: ; %bb.0: @@ -9409,56 +10482,108 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, 
v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 
s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg: ; GFX10: ; %bb.0: @@ -9664,57 +10789,110 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % } define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: 
s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, 
vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16: ; GFX942: ; %bb.0: @@ -9757,52 +10935,100 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] 
; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16: ; GFX10: ; %bb.0: @@ -9996,59 +11222,114 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { } define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, 
null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 
v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -10093,54 +11374,104 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX942-NEXT: 
s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; 
GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; 
GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: ; GFX10: ; %bb.0: @@ -10340,59 +11671,114 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr 
%ptr, bfloat % } define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, 
v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: 
s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: ; GFX942: ; %bb.0: @@ -10438,54 +11824,104 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; 
GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 
v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: ; GFX10: ; %bb.0: @@ -10665,69 +12101,114 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, v2 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB37_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 -1024 - %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst - ret void -} - -define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, 
s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr %ptr, i64 -1024 + %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst + ret void +} + +define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 
16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: ; GFX942: ; %bb.0: @@ -10764,44 +12245,84 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 
0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 
0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: ; GFX10: ; %bb.0: @@ -10971,47 +12492,90 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, } define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 
16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: 
s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: ; GFX942: ; %bb.0: @@ -11047,42 +12611,80 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 
s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 
16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: ; GFX10: ; %bb.0: @@ -11249,62 +12851,120 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, } define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: 
flat_system_atomic_fsub_ret_bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, 
v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: 
s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -11350,56 +13010,108 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos: ; GFX10: ; %bb.0: @@ -11607,60 +13319,116 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat } define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 { -; GFX12-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; 
GFX12-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -11705,54 +13473,104 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 -; 
GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: ; GFX10: ; %bb.0: @@ -13743,57 +15561,111 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; -------------------------------------------------------------------- define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: 
s_cbranch_execnz .LBB50_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16: ; GFX942: ; %bb.0: @@ -13837,54 +15709,104 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
flat_agent_atomic_fsub_ret_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, 
v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2bf16: ; GFX10: ; %bb.0: @@ -14090,57 +16012,111 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v } define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 
0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 -; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: 
v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -14184,55 +16160,105 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX942-NEXT: v_mov_b32_e32 v0, 
v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, 
vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, 
vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -14440,57 +16466,111 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, } define <2 x bfloat> 
@flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; 
GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: ; GFX942: ; %bb.0: @@ -14541,59 +16621,113 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 -; GFX11-NEXT: flat_load_b32 v0, v[4:5] -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; 
GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5] +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v0, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: +; GFX11-FAKE16: ; 
%bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 +; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5] +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg: ; GFX10: ; %bb.0: @@ -14808,55 +16942,107 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, } define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: 
flat_agent_atomic_fsub_noret_v2bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX942: ; %bb.0: @@ -14899,52 +17085,100 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: @@ -15144,55 +17378,107 @@ define void 
@flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 } define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 
-; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -15235,52 +17521,100 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: 
flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] 
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: @@ -15487,55 +17821,107 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b } define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 
v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX942: ; %bb.0: @@ -15584,57 +17970,110 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b32 v3, v[3:4] -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: 
v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX10: ; %bb.0: @@ -15849,58 +18288,113 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b } define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 
s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, 
v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: 
v_sub_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -15944,54 +18438,104 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: -; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: 
s_cbranch_execnz .LBB56_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) 
| instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; 
GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: @@ -16202,56 +18746,109 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, } define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: -; GFX12: ; %bb.0: -; 
GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 -; GFX12-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 
v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -16294,52 +18891,100 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 
0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 690e5cc68747f..bcd5d1e87954f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -1,10 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GFX942-SDAG ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GFX942-GISEL -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11-SDAG -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11-GISEL -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12-SDAG -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | 
FileCheck %s -check-prefixes=GFX12-GISEL +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11-SDAG,GFX11-SDAG-TRUE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11-SDAG,GFX11-SDAG-FAKE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11-GISEL,GFX11-GISEL-TRUE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11-GISEL,GFX11-GISEL-FAKE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12-SDAG,GFX12-SDAG-TRUE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12-SDAG,GFX12-SDAG-FAKE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12-GISEL,GFX12-GISEL-TRUE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12-GISEL,GFX12-GISEL-FAKE16 ; Test flat scratch SVS addressing mode with various combinations of alignment ; of soffset, voffset and inst_offset. 
@@ -52,24 +56,45 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: soff1_voff1: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: soff1_voff1: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 1, v2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 2, v2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v2, 4, v2 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v3, v0, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v4, v0, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: 
scratch_store_b8 v2, v1, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: soff1_voff1: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v2, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: soff1_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb @@ -89,19 +114,35 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: soff1_voff1: -; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; 
GFX12-SDAG-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: soff1_voff1: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 2 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 4 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-SDAG-FAKE16-LABEL: soff1_voff1: +; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff1_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb @@ -177,26 +218,49 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: soff1_voff2: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: 
v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: soff1_voff2: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 1, v2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 2, v2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v2, 4, v2 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v3, v0, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v4, v0, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; 
GFX11-SDAG-FAKE16-LABEL: soff1_voff2: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v2, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: soff1_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb @@ -219,21 +283,39 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: soff1_voff2: -; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 
v0, v3, s0 offset:4 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: soff1_voff2: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v2, 2, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-SDAG-FAKE16-LABEL: soff1_voff2: +; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0 +; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff1_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb @@ -310,26 +392,49 @@ define amdgpu_kernel void 
@soff1_voff4(i32 %soff) { ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: soff1_voff4: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: soff1_voff4: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 1, v2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 2, v2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v2, 4, v2 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v3, v0, off dlc +; GFX11-SDAG-TRUE16-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v4, v0, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: soff1_voff4: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v2, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: soff1_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb @@ -352,21 +457,39 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: soff1_voff4: -; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, 
v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: soff1_voff4: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v2, 4, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-SDAG-FAKE16-LABEL: soff1_voff4: +; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS +; 
GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff1_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb @@ -443,26 +566,49 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: soff2_voff1: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: soff2_voff1: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 1, v2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 2, v2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v2, 4, v2 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v3, v0, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v4, v0, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: soff2_voff1: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v2, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: soff2_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb @@ -485,20 +631,37 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: soff2_voff1: -; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, 
s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: soff2_voff1: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 2 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 4 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v1, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-SDAG-FAKE16-LABEL: soff2_voff1: +; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 
v0, v2, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff2_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb @@ -576,27 +739,51 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: soff2_voff2: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v5, v3, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: soff2_voff2: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 2, v2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 4, v2 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, off offset:1 dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v3, v0, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: soff2_voff2: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 4, v0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v2, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v3, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: soff2_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb @@ -621,22 +808,41 
@@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: soff2_voff2: -; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: soff2_voff2: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v2, 2, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-SDAG-FAKE16-LABEL: soff2_voff2: +; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb +; 
GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0 +; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff2_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb @@ -716,27 +922,51 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: soff2_voff4: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v5, v3, off dlc 
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: soff2_voff4: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 2, v2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 4, v2 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, off offset:1 dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v3, v0, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: soff2_voff4: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 4, v0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v2, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v3, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: soff2_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb @@ -761,22 +991,41 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: soff2_voff4: -; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: soff2_voff4: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v2, 4, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; 
GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-SDAG-FAKE16-LABEL: soff2_voff4: +; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff2_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb @@ -855,26 +1104,49 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: soff4_voff1: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: soff4_voff1: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 1, v2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 2, v2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v2, 4, v2 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v3, v0, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v4, v0, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: soff4_voff1: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 
0x3ff, v0 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v2, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: soff4_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb @@ -897,20 +1169,37 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: soff4_voff1: -; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: soff4_voff1: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 
v1.l, 1 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 2 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 4 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v1, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-SDAG-FAKE16-LABEL: soff4_voff1: +; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff4_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb @@ -988,27 +1277,51 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: soff4_voff2: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; 
GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v5, v3, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: soff4_voff2: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 2, v2 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 4, v2 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, off offset:1 dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v3, v0, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v4, v1, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: 
soff4_voff2: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 4, v0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v2, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v3, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: soff4_voff2: ; GFX11-GISEL: ; %bb.0: ; %bb @@ -1033,22 +1346,41 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: soff4_voff2: -; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS -; 
GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: soff4_voff2: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v2, 2, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-SDAG-FAKE16-LABEL: soff4_voff2: +; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0 +; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: 
s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff4_voff2: ; GFX12-GISEL: ; %bb.0: ; %bb @@ -1127,26 +1459,49 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: soff4_voff4: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v3, 4, v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v3, v4, off dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: soff4_voff4: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 4, v2 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, off offset:1 dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, off offset:2 dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v3, v1, off dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: soff4_voff4: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v4, 4 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v3, 4, v0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v3, v4, off dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: soff4_voff4: ; GFX11-GISEL: ; %bb.0: ; %bb @@ -1171,22 +1526,41 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: soff4_voff4: -; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: 
v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: soff4_voff4: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v2, 4, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-SDAG-FAKE16-LABEL: soff4_voff4: +; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 
v0, 4, v0 +; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff4_voff4: ; GFX12-GISEL: ; %bb.0: ; %bb @@ -1246,16 +1620,28 @@ define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) { ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: soff1_voff1_negative: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: soff1_voff1_negative: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, 0, s0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v1, v0, off offset:-1 dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: soff1_voff1_negative: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, 
v0 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: soff1_voff1_negative: ; GFX11-GISEL: ; %bb.0: ; %bb @@ -1268,14 +1654,24 @@ define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) { ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm ; -; GFX12-SDAG-LABEL: soff1_voff1_negative: -; GFX12-SDAG: ; %bb.0: ; %bb -; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:-1 scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: soff1_voff1_negative: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v1, v0, s0 offset:-1 scope:SCOPE_SYS +; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-SDAG-FAKE16-LABEL: soff1_voff1_negative: +; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:-1 scope:SCOPE_SYS +; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: soff1_voff1_negative: ; GFX12-GISEL: ; %bb.0: ; %bb @@ -1296,3 +1692,10 @@ bb: store volatile i8 1, ptr addrspace(5) %p1 ret void } +;; NOTE: These prefixes 
are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-GISEL-FAKE16: {{.*}} +; GFX11-GISEL-TRUE16: {{.*}} +; GFX11-SDAG: {{.*}} +; GFX12-GISEL-FAKE16: {{.*}} +; GFX12-GISEL-TRUE16: {{.*}} +; GFX12-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 125d009429cbf..7a1351174733b 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -6,7 +6,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f16: @@ -255,42 +256,81 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX1150-LABEL: frem_f16: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; 
GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v0, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 -; GFX1150-NEXT: s_waitcnt vmcnt(1) -; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_fmac_f32_e32 v3, v5, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1150-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 -; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 -; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1150-NEXT: s_endpgm +; GFX1150-TRUE16-LABEL: frem_f16: +; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-TRUE16-NEXT: 
s_clause 0x1 +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l +; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1150-TRUE16-NEXT: s_endpgm +; +; GFX1150-FAKE16-LABEL: frem_f16: +; GFX1150-FAKE16: ; %bb.0: +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: s_load_b128 
s[0:3], s[4:5], 0x24 +; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 +; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1150-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr half, ptr addrspace(1) %in2, 
i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 @@ -456,26 +496,47 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX1150-LABEL: fast_frem_f16: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v0, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_rcp_f16_e32 v3, v2 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3 -; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 -; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 -; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1150-NEXT: s_endpgm +; GFX1150-TRUE16-LABEL: fast_frem_f16: +; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h +; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1150-TRUE16-NEXT: s_endpgm +; +; GFX1150-FAKE16-LABEL: fast_frem_f16: +; GFX1150-FAKE16: ; %bb.0: +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 +; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1150-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 @@ -641,26 +702,47 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX1150-LABEL: unsafe_frem_f16: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v0, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; 
GFX1150-NEXT: v_rcp_f16_e32 v3, v2 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3 -; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 -; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 -; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1150-NEXT: s_endpgm +; GFX1150-TRUE16-LABEL: unsafe_frem_f16: +; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h +; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1150-TRUE16-NEXT: s_endpgm +; +; GFX1150-FAKE16-LABEL: unsafe_frem_f16: +; GFX1150-FAKE16: ; %bb.0: +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; 
GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 +; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1150-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 @@ -2308,68 +2390,130 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX1150-LABEL: frem_v2f16: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v0, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 -; GFX1150-NEXT: s_waitcnt vmcnt(1) -; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_fmac_f32_e32 v4, v7, v6 
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1150-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v5, v3 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_trunc_f16_e32 v4, v4 -; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_fmac_f16_e32 v3, v4, v5 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX1150-NEXT: v_rcp_f32_e32 v5, v5 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v5 -; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f32_e32 v4, v6, v5 -; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v5, v6, v5 -; GFX1150-NEXT: v_and_b32_e32 v5, 0xff800000, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v2, v1 -; GFX1150-NEXT: v_trunc_f16_e32 v4, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4 -; GFX1150-NEXT: v_fmac_f16_e32 v1, v4, v2 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v3 -; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1150-NEXT: s_endpgm +; GFX1150-TRUE16-LABEL: frem_v2f16: +; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_b32 v3, v1, s[4:5] offset:16 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.h +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v5.l, v4.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v0.l, v5.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v5, v3.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v5, v5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v5 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v5, v6, v5 +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v5, v0 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v0, v2.l, v4.l +; GFX1150-TRUE16-NEXT: 
global_store_b32 v1, v0, s[0:1] +; GFX1150-TRUE16-NEXT: s_endpgm +; +; GFX1150-FAKE16-LABEL: frem_v2f16: +; GFX1150-FAKE16: ; %bb.0: +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1150-FAKE16-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v7, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v5, v3 +; GFX1150-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v3, v4, v5 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v5, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v5 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v6, v5 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v6, v5 +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v2, v1 +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v4, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3 +; GFX1150-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1150-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4 %r0 = load <2 x half>, ptr 
addrspace(1) %in1, align 8 @@ -3034,115 +3178,226 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX1150-LABEL: frem_v4f16: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v4, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[2:3] -; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 -; GFX1150-NEXT: s_waitcnt vmcnt(1) -; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v8, v7 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v8, v8 -; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v8 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_fmac_f32_e32 v6, v9, v8 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_and_b32_e32 v8, 0xff800000, v8 -; GFX1150-NEXT: v_add_f32_e32 v6, v8, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v7, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1150-NEXT: v_trunc_f16_e32 v6, v6 -; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_fmac_f16_e32 v5, v6, v7 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX1150-NEXT: v_rcp_f32_e32 v7, v7 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v7 -; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f32_e32 v6, v8, v7 -; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7 -; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_add_f32_e32 v6, v7, v6 -; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v2, v0 -; GFX1150-NEXT: v_trunc_f16_e32 v6, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6 -; GFX1150-NEXT: v_fma_f16 v0, v6, v2, v0 -; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v5 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX1150-NEXT: v_rcp_f32_e32 v7, v7 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v5, v5, 
v7 -; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f32_e32 v5, v8, v7 -; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7 -; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v6, v2 -; GFX1150-NEXT: v_trunc_f16_e32 v5, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5 -; GFX1150-NEXT: v_fmac_f16_e32 v2, v5, v6 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_fmac_f32_e32 v5, v7, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1150-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v3, 
v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_trunc_f16_e32 v5, v5 -; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f16_e32 v1, v5, v3 -; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1] -; GFX1150-NEXT: s_endpgm +; GFX1150-TRUE16-LABEL: frem_v4f16: +; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, 0 +; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.h +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.h +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v0.l, v7.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v3.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v7 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v10, v7 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v7, v8, v7 +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v3.l, v1.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v4.h +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v6.l +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v3 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v6.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l +; 
GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v3.l +; GFX1150-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1] +; GFX1150-TRUE16-NEXT: s_endpgm +; +; GFX1150-FAKE16-LABEL: frem_v4f16: +; GFX1150-FAKE16: ; %bb.0: +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: global_load_b64 v[0:1], v4, s[2:3] +; GFX1150-FAKE16-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 +; GFX1150-FAKE16-NEXT: s_waitcnt 
vmcnt(1) +; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v8, v8 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v8 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v9, v8 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v8, 0xff800000, v8 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v8, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v7, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v5, v6, v7 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v8, v7 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7 +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v2, v0 +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6 +; GFX1150-FAKE16-NEXT: v_fma_f16 v0, v6, v2, v0 +; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v5 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v7 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v8, v7 +; 
GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7 +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v6, v2 +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5 +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v2, v5, v6 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v7, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; 
GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v5, v3 +; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX1150-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX1150-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4 %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir index 85c6577893396..ddf3aa2e17ca4 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX11 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GFX11 %s --- name: test_fmamk_reg_imm_f16 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 888c1e225e7c1..13c9ef46ab4e4 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc 
-mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -8252,50 +8254,95 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; -------------------------------------------------------------------- define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: 
v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8330,45 +8377,85 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-TRUE16-NEXT: ; %bb.2: 
; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, 
s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8591,51 +8678,97 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr } define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: 
s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB45_1: ; 
%atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8672,46 +8805,87 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: 
v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: 
s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8942,51 +9116,97 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra } define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, 
v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB46_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9024,46 +9244,87 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB46_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, 
v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB46_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9294,48 +9555,91 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra } define void 
@global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 
s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: 
s_cbranch_execnz .LBB47_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB47_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9369,43 +9673,81 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB47_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB47_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9621,49 +9963,93 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p } define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: 
; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB48_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9699,44 +10085,83 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 
v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB48_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: 
global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB48_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9959,49 +10384,93 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g } define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB49_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10038,44 +10507,83 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB49_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10298,39 +10806,73 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g } define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: 
v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; 
GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10357,34 +10899,63 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off 
offset:2046 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10560,37 +11131,69 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ } define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB51_1 +; 
GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10616,32 +11219,59 @@ define void 
@global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f16_e32 v3, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, 
s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10812,52 +11442,99 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n } define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb 
scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; 
GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10894,46 +11571,87 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_add_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11166,50 +11884,95 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr } define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off 
th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: 
v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -11245,44 +12008,83 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, 
v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11511,59 +12313,114 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; -------------------------------------------------------------------- define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; 
GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, 
v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -11607,54 +12464,104 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-FAKE16-NEXT: ; %bb.2: 
; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11904,61 +12811,118 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( } define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: 
v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -12004,56 +12968,108 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, 
v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -12302,70 +13318,127 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr 
bfloat, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret bfloat %result -} - -define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret bfloat %result +} + +define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: 
s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: 
v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -12412,56 +13485,108 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 -; 
GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; 
GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -12719,57 +13844,110 @@ define bfloat 
@global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ } define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: 
v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -12812,52 +13990,100 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: 
v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -13100,59 +14326,114 @@ 
define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( } define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB58_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: 
v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; 
GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -13197,54 +14478,104 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; 
GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -13494,59 +14825,114 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ } define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB59_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; 
GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; 
GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -13592,54 +14978,104 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: 
v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 
-; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -13889,49 +15325,94 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ } define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; 
GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB60_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; 
GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz 
.LBB60_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -13968,44 +15449,84 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h 
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -14214,47 +15735,90 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ } define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB61_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; 
GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -14290,42 +15854,80 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 
v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -14508,83 +16110,141 @@ define void 
@global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v5 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB61_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 
-; GFX12-NEXT: .LBB62_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB62_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) 
+; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB61_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB62_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: 
v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB62_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -14630,56 +16290,108 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB62_1: ; %atomicrmw.start 
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB62_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, 
vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB62_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB62_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -14939,60 +16651,116 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine } define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; 
GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB63_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB63_1 -; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 
v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB63_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB63_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -15037,54 +16805,104 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB63_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB63_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; 
GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB63_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; 
GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB63_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -18560,54 +20378,104 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB78_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB78_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB78_1: ; 
%atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB78_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 
0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -18889,54 +20757,104 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB79_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB79_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 
+; GFX11-FAKE16-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB79_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -19220,54 +21138,104 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB80_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB80_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB80_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: 
s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: 
s_cbranch_execnz .LBB80_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -19555,52 +21523,100 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB81_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB81_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, 
v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB81_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB81_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -19874,52 +21890,100 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB82_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; 
GFX11-NEXT: s_cbranch_execnz .LBB82_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB82_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB82_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -20196,52 +22260,100 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB83_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB83_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 
v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB83_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, 
vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB83_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -20527,54 +22639,104 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB84_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB84_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: 
s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB84_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; 
GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB84_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -20861,52 +23023,100 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB85_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB85_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 
+; GFX11-TRUE16-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB85_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: 
s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], 
off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB85_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -21185,54 +23395,104 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB86_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB86_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: @@ -21514,52 +23774,100 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB87_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 
v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB87_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) 
| instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB87_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: 
s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB87_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 
exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: @@ -21833,54 +24141,104 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB88_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: 
v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB88_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; 
GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB88_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB88_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: @@ -22162,52 +24520,100 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB89_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 
v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB89_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB89_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB89_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: @@ -22481,54 +24887,104 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fadd_ret_v2bf16__maybe_remote: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB90_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB90_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB90_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; 
GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB90_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX10: ; %bb.0: @@ -22810,52 +25266,100 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB91_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB91_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; 
GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; 
GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB91_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) 
| instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB91_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index b4286a07bbf7e..a24d6c5ff2222 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; 
RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -4443,52 +4445,99 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__ ; -------------------------------------------------------------------- define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -4525,47 +4574,89 @@ define half 
@global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -4796,53 +4887,103 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr } define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 
0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: 
s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -4881,48 +5022,93 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -5161,53 +5347,103 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra } define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; 
GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, 
v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -5247,48 +5483,93 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: 
global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -5527,51 +5808,97 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra } define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l +; GFX12-TRUE16-NEXT: 
v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; 
GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -5607,46 +5934,87 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, 
v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -5870,52 +6238,101 @@ define void 
@global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p } define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; 
GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 
v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -5953,47 +6370,91 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 
v6, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6224,52 +6685,101 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g } define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: 
v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; 
GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6308,47 +6818,91 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: s_or_b64 exec, exec, 
s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6579,41 +7133,77 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g } define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: 
s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv 
scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: 
s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6642,36 +7232,67 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 -; GFX11-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6855,40 +7476,75 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ } define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 
-; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v4 +; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6916,35 +7572,65 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v4 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: 
s_cbranch_execnz .LBB33_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7123,54 +7809,105 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n } define half 
@global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: 
s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; 
GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7209,48 +7946,93 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, 
v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: 
buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7491,53 +8273,103 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr } define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 
0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: 
global_load_b32 v6, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: 
global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; 
GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7575,47 +8407,91 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 
0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: 
.LBB35_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7844,67 +8720,122 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -; -------------------------------------------------------------------- -; bfloat -; -------------------------------------------------------------------- - -define bfloat 
@global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; 
GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] + ret void +} + +; -------------------------------------------------------------------- +; bfloat +; -------------------------------------------------------------------- + +define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; 
GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7948,54 +8879,104 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; 
GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8247,61 +9228,118 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( } define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: 
v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8347,56 +9385,108 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8656,61 +9746,118 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ } define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 
v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, 
v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, 
vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8757,56 +9904,108 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; 
GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 
v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9066,57 +10265,110 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ } define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: 
s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; 
GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9159,52 
+10411,100 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: 
buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 
v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9449,59 +10749,114 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( } define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; 
%bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; 
GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; 
GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; 
GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9546,54 +10901,104 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 
v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9836,68 +11241,123 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start 
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void 
@global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; 
GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9943,54 +11403,104 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; 
GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 +; 
GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10242,49 +11752,94 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ } define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, 
v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10321,44 +11876,84 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10569,47 +12164,90 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ } define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; 
GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10645,42 +12283,80 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], 
v[2:3], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10886,62 +12562,120 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ } define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; 
GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN 
scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10987,56 +12721,108 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11298,60 +13084,116 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine } define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 
+; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: 
global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -11396,54 +13238,104 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; 
GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: 
s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -14038,57 +15930,111 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin ; -------------------------------------------------------------------- define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: 
s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -14132,55 +16078,105 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; 
GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 
s[30:31] -; +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; 
GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -14438,57 +16434,111 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained } define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; 
GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 -; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, 
v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -14532,54 +16582,104 @@ define <2 x bfloat> 
@global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; 
GFX10: ; %bb.0: @@ -14840,57 +16940,111 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ } define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 
0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe 
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -14934,54 +17088,104 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 
s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -15246,55 +17450,107 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ } define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, 
v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: 
global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; 
GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -15337,52 +17593,100 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -15633,55 +17937,107 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor } define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) 
%ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: 
s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB58_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 
v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -15724,52 +18080,100 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 
v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -16023,55 +18427,107 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin } define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 
0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB59_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: 
s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv 
scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; 
GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -16114,52 +18570,100 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; 
GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; 
GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -16421,58 +18925,113 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin } define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB60_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-TRUE16-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; 
GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; 
GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -16516,54 +19075,104 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: 
global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 
0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -16826,56 +19435,109 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu } define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 
0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB61_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, 
v2, v6 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 
v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -16918,52 +19580,100 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_max_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: 
global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 
s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 92a402dc4d65b..5834d4ab4d8e7 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | 
FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -4443,52 +4445,99 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__ ; -------------------------------------------------------------------- define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; 
GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 
s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX942-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -4525,47 +4574,89 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, 
s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; 
GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -4796,53 +4887,103 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr } define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; 
GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; 
GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, 
s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -4881,48 +5022,93 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, 
v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -5161,53 +5347,103 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra } define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; 
GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, 
v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -5247,48 +5483,93 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 
-4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v0, v1, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -5527,51 +5808,97 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra } define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, 
v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -5607,46 +5934,87 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 
v3, v0 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB29_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -5870,52 +6238,101 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p } define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; 
GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB30_1: ; 
%atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -5953,47 +6370,91 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, 
vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6224,52 +6685,101 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g } define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 
0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 
3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6308,47 +6818,91 @@ define void 
@global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz 
.LBB31_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; 
GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6579,41 +7133,77 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g } define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6642,36 +7232,67 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; 
GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -6855,40 +7476,75 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ } define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; 
GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; 
GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -6916,35 +7572,65 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v2, v2, v4 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7123,54 +7809,105 @@ define void 
@global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n } define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; 
GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7209,48 +7946,93 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 
s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; 
GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7491,53 +8273,103 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr } define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 
3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: 
global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; 
GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7575,47 +8407,91 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 
0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: 
.LBB35_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -7844,67 +8720,122 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -; -------------------------------------------------------------------- -; bfloat -; -------------------------------------------------------------------- - -define bfloat 
@global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; 
GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] + ret void +} + +; -------------------------------------------------------------------- +; bfloat +; -------------------------------------------------------------------- + +define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; 
GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -7948,54 +8879,104 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; 
GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8247,61 +9228,118 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( } define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: 
v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8347,56 +9385,108 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -8656,61 +9746,118 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ } define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 
v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, 
v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, 
vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -8757,56 +9904,108 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB38_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; 
GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 
v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9066,57 +10265,110 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ } define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: 
s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; 
GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9159,52 
+10411,100 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: 
buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 
v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9449,59 +10749,114 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( } define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; 
%bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; 
GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; 
GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; 
GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9546,54 +10901,104 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 
v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -9836,68 +11241,123 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB40_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 - %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 - ret void -} - -define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start 
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 + %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void 
@global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; 
GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -9943,54 +11403,104 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; 
GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 +; 
GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10242,49 +11752,94 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ } define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, 
v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10321,44 +11876,84 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB42_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10569,47 +12164,90 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ } define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; 
GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10645,42 +12283,80 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], 
v[2:3], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB43_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -10886,62 +12562,120 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ } define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; 
GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN 
scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -10987,56 +12721,108 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB44_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -11298,60 +13084,116 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine } define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 
+; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: 
global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -11396,54 +13238,104 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; 
GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB45_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: 
s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -14038,57 +15930,111 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin ; -------------------------------------------------------------------- define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: 
s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -14132,55 +16078,105 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; 
GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 
s[30:31] -; +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; 
GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -14438,57 +16434,111 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained } define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; 
GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 -; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, 
v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -14532,54 +16582,104 @@ define <2 x bfloat> 
@global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; 
GFX10: ; %bb.0: @@ -14840,57 +16940,111 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ } define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 
0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe 
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -14934,54 +17088,104 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 
s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -15246,55 +17450,107 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ } define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, 
v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: 
global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; 
GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -15337,52 +17593,100 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -15633,55 +17937,107 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor } define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) 
%ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: 
s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB58_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 
v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -15724,52 +18080,100 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB58_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 
v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -16023,55 +18427,107 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin } define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 
0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB59_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: 
s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv 
scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; 
GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -16114,52 +18570,100 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB59_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; 
GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; 
GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -16421,58 +18925,113 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin } define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB60_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-TRUE16-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; 
GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; 
GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -16516,54 +19075,104 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB60_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: 
global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 
0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: @@ -16826,56 +19435,109 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu } define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 
0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB61_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, 
v2, v6 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 
v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: @@ -16918,52 +19580,100 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; 
GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB61_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: 
global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 
s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 2f5d9d746dc17..765185327a03e 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | 
FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -5198,50 +5200,95 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1) ; -------------------------------------------------------------------- define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_ret_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB22_1: ; 
%atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f16: ; GFX942: ; %bb.0: @@ -5276,45 +5323,85 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_ret_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; 
GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_ret_f16: ; GFX10: ; %bb.0: @@ -5537,51 +5624,97 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) } define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: 
s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: 
s_cbranch_execnz .LBB23_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; 
GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: ; GFX942: ; %bb.0: @@ -5618,46 +5751,87 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 
0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, 
v6 +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: ; GFX10: ; %bb.0: @@ -5888,51 +6062,97 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p } define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: 
s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: ; GFX942: ; %bb.0: @@ -5970,46 +6190,87 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: ; GFX10: ; %bb.0: @@ -6240,48 +6501,91 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p } define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_noret_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f16: ; GFX942: ; %bb.0: @@ -6315,43 +6619,81 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_noret_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 
-; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f16: ; GFX10: ; %bb.0: @@ -6567,49 +6909,93 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val } define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: 
.LBB26_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 
0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, 
v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: ; GFX942: ; %bb.0: @@ -6645,44 +7031,83 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: -; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 
v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: ; GFX10: ; %bb.0: @@ -6905,49 +7330,93 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) } define void 
@global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; 
GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; 
GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: 
global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: ; GFX942: ; %bb.0: @@ -6984,44 +7453,83 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, 
v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 
0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, 
v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: ; GFX10: ; %bb.0: @@ -7244,39 +7752,73 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) } define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f16_e32 v3, v4, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv 
scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: ; GFX942: ; %bb.0: @@ -7303,34 +7845,63 @@ define half 
@global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v3, v4, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB28_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_sub_f16_e32 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: 
s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: ; GFX10: ; %bb.0: @@ -7490,53 +8061,85 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 -; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] -; GFX6-NEXT: s_cbranch_execnz .LBB28_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 - %result = atomicrmw fsub ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 - ret half %result -} - -define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_sub_f16_e32 v3, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB28_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 + %result = atomicrmw fsub ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4 + ret half %result +} + +define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 { +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off 
offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: ; GFX942: ; %bb.0: @@ -7562,32 +8165,59 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_f16_e32 v3, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz 
.LBB29_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: ; GFX10: ; %bb.0: @@ -7758,52 +8388,99 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs } define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB30_1: ; 
%atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: 
v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: ; GFX942: ; %bb.0: @@ -7840,46 +8517,87 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; 
GFX11-NEXT: s_cbranch_execnz .LBB30_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: 
buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: ; GFX10: ; %bb.0: @@ -8112,50 +8830,95 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % } define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 { -; GFX12-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 
+; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: ; GFX942: ; %bb.0: @@ -8191,44 +8954,83 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; 
GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: 
global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: ; GFX10: ; %bb.0: @@ -8457,59 +9259,114 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; -------------------------------------------------------------------- define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_ret_bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: 
s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: 
v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_bf16: ; GFX942: ; %bb.0: @@ -8553,54 +9410,104 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_ret_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
global_agent_atomic_fsub_ret_bf16: ; GFX10: ; %bb.0: @@ -8850,61 +9757,118 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % } define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; 
GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; 
GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -8950,56 +9914,108 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, 
v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: ; GFX10: ; %bb.0: @@ -9257,61 +10273,118 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) } define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
global_agent_atomic_fsub_ret_bf16__offset12b_neg: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: ; GFX942: ; %bb.0: @@ 
-9358,56 +10431,108 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, 
v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 
s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: 
s_cbranch_execnz .LBB34_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: ; GFX10: ; %bb.0: @@ -9663,59 +10788,112 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) %result = atomicrmw fsub ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst ret bfloat %result } - -define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_noret_bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v6, v3 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] + +define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 { +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start 
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_bf16: ; GFX942: ; %bb.0: @@ -9758,52 +10936,100 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_noret_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 
v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_bf16: ; GFX10: ; %bb.0: @@ -10046,59 +11272,114 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % } define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB36_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -10143,54 +11424,104 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB36_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, 
v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: ; GFX10: ; %bb.0: @@ -10440,59 +11771,114 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) } define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB37_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 
s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: ; GFX942: ; %bb.0: @@ -10538,54 +11924,104 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, 
v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB37_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 
v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 
0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: ; GFX10: ; %bb.0: @@ -10835,49 +12271,94 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) } define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB38_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB38_1: ; 
%atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: 
s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: 
v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: ; GFX942: ; %bb.0: @@ -10914,44 +12395,84 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
s_cbranch_execnz .LBB38_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, 
s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: ; GFX10: ; %bb.0: @@ -11160,47 +12681,90 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr } define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off 
offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB39_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 
0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo 
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: ; GFX942: ; %bb.0: @@ -11236,42 +12800,80 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB39_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 
v2, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 
v2, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: ; GFX10: ; %bb.0: @@ -11475,62 +13077,120 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr } define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v4, 
v3, 0xffff -; GFX12-NEXT: v_not_b32_e32 v4, v4 -; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB40_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: 
s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, 
v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -11576,56 +13236,108 @@ define bfloat 
@global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 -; GFX11-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB40_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; 
GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v5, 
v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1 +; GFX11-FAKE16-NEXT: ; %bb.2: 
; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: ; GFX10: ; %bb.0: @@ -11885,60 +13597,116 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 } define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 { -; GFX12-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-NEXT: v_not_b32_e32 v5, v5 -; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB41_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; 
GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: +; 
GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -11977,60 +13745,110 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX942-NEXT: s_cbranch_execnz .LBB41_1 -; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB41_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB41_1 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: ; GFX10: ; %bb.0: @@ -14479,57 +16297,111 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; -------------------------------------------------------------------- define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_ret_v2bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; 
GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
global_agent_atomic_fsub_ret_v2bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; 
GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_v2bf16: ; GFX942: ; %bb.0: @@ -14573,54 +16445,104 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_ret_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; 
GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB50_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 
s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; 
GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_ret_v2bf16: ; GFX10: ; %bb.0: @@ -14879,57 +16801,111 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, } define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, 
v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: 
v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -14973,54 +16949,104 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off 
offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB51_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 
0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, 
v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: @@ -15281,57 +17307,111 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr } define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; 
GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: ; GFX942: ; %bb.0: @@ -15375,54 +17455,104 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX11-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB52_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg: ; GFX10: ; %bb.0: @@ -15687,55 +17817,107 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr } define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: 
global_agent_atomic_fsub_noret_v2bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 -; GFX12-NEXT: ; %bb.2: 
; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 
v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, 
v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX942: ; %bb.0: @@ -15778,52 +17960,100 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 
-; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB53_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; 
GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-TRUE16-NEXT: 
; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: @@ -16074,55 +18304,107 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b } define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: 
v_sub_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -16165,52 +18447,100 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 
v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB54_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 
0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: @@ -16464,55 +18794,107 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr 
addrspace( } define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; 
GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX942: ; %bb.0: @@ -16555,52 +18937,100 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], 
off offset:-2048 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB55_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: 
global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: 
buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg: ; GFX10: ; %bb.0: @@ -16862,58 +19292,113 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( } define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, 
v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -16957,54 +19442,104 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; 
GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB56_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: 
s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5 +; GFX11-TRUE16-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) 
| instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: @@ -17267,56 +19802,109 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add } define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 { -; GFX12-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, 
v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
global_system_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; 
GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; 
GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX942: ; %bb.0: @@ -17359,52 +19947,100 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 
6 -; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB57_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll index fb4c252916b05..947c838740d43 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll @@ -1,7 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1150 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1150,GFX1150-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1150,GFX1150-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s define amdgpu_ps <3 x float> @gather_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, <4 x i32> inreg %samp2, float %s, float %t) { ; GFX11-LABEL: gather_sample: @@ -80,35 +83,69 @@ define amdgpu_ps <3 x float> @sample_gather(<8 x i32> inreg %rsrc, <4 x i32> inr } define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, i16 %s.16, i16 %t.16, i16 %fragid) { -; GFX11-LABEL: sample_load: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_load: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog ; -; GFX1150-LABEL: sample_load: 
-; GFX1150: ; %bb.0: -; GFX1150-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX1150-NEXT: v_mov_b32_e32 v4, 0 -; GFX1150-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: ; return to shader part epilog +; GFX11-FAKE16-LABEL: sample_load: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: sample_load: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX1150-TRUE16-LABEL: sample_load: +; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX1150-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: ; return to shader part epilog +; +; 
GFX1150-FAKE16-LABEL: sample_load: +; GFX1150-FAKE16: ; %bb.0: +; GFX1150-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX1150-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_load: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-TRUE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_load: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog %w = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i16 %s.16, i16 %t.16, i16 %fragid, <8 x i32> %rsrc2, i32 0, i32 0) @@ -122,35 +159,69 @@ define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg } define amdgpu_ps <3 x float> @load_sample(<8 x i32> inreg 
%rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, i16 %s.16, i16 %t.16, i16 %fragid) { -; GFX11-LABEL: load_sample: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: load_sample: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog ; -; GFX1150-LABEL: load_sample: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX1150-NEXT: v_mov_b32_e32 v4, 0 -; GFX1150-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: ; return to shader part epilog +; GFX11-FAKE16-LABEL: load_sample: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog ; -; GFX12-LABEL: load_sample: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX1150-TRUE16-LABEL: load_sample: +; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX1150-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1150-FAKE16-LABEL: load_sample: +; GFX1150-FAKE16: ; %bb.0: +; GFX1150-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX1150-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: load_sample: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-TRUE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: image_sample_lz v2, [v4, v4], 
s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: load_sample: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i16 %s.16, i16 %t.16, i16 %fragid, <8 x i32> %rsrc2, i32 0, i32 0) %w = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 2bc2a2a745f3f..ae4acfe35d08e 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 
%s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -1238,48 +1240,91 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; -------------------------------------------------------------------- define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fadd_ret_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v2, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v3, v3 -; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX12-NEXT: v_add_f16_e32 v2, 4.0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fadd_ret_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; 
GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_f16: ; GFX942: ; %bb.0: @@ -1311,42 +1356,79 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_ret_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v2, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v3, v3 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX11-NEXT: v_add_f16_e32 v2, 4.0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, 
v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 +; 
GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_ret_f16: ; GFX10: ; %bb.0: @@ -1543,50 +1625,95 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { } define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fadd_ret_f16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-NEXT: v_add_f16_e32 v3, 4.0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 -; GFX12-NEXT: ; 
%bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fadd_ret_f16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, 
vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_f16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; 
GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_f16__offset: ; GFX942: ; %bb.0: @@ -1619,44 +1746,83 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_ret_f16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-NEXT: v_add_f16_e32 v3, 4.0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_f16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_f16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_ret_f16__offset: ; GFX10: ; %bb.0: @@ -1860,47 +2026,89 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { } define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fadd_noret_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v2, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v3, v3 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; 
GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fadd_noret_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: s_wait_dscnt 
0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_f16: ; GFX942: ; %bb.0: @@ -1931,41 +2139,77 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_noret_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v2, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v3, v3 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: 
s_cbranch_execnz .LBB10_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
local_atomic_fadd_noret_f16: ; GFX10: ; %bb.0: @@ -2154,48 +2398,91 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { } define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fadd_noret_f16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 
0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fadd_noret_f16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_f16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_f16__offset: ; GFX942: ; %bb.0: @@ -2227,42 +2514,79 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_noret_f16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v4, 4.0, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_f16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_f16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
local_atomic_fadd_noret_f16__offset: ; GFX10: ; %bb.0: @@ -2458,39 +2782,73 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind } define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fadd_ret_f16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v1, 4.0, v2 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fadd_ret_f16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: 
s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_f16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 
0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_f16__offset__align4: ; GFX942: ; %bb.0: @@ -2515,33 +2873,61 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_ret_f16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v1, 4.0, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_f16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_f16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; 
GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_ret_f16__offset__align4: ; GFX10: ; %bb.0: @@ -2696,37 +3082,69 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no } define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fadd_noret_f16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f16_e32 v2, 4.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; 
GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fadd_noret_f16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; 
GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_f16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_f16__offset__align4: ; GFX942: ; %bb.0: @@ -2750,31 +3168,57 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
local_atomic_fadd_noret_f16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f16_e32 v2, 4.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_f16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_f16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_f16__offset__align4: ; GFX10: ; %bb.0: @@ -2927,57 +3371,110 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; -------------------------------------------------------------------- define bfloat 
@local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fadd_ret_bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 
0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fadd_ret_bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) 
| instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_bf16: ; GFX942: ; %bb.0: @@ -3017,51 +3514,98 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_ret_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; 
GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 
v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_ret_bf16: ; GFX10: ; %bb.0: @@ -3259,81 +3803,136 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX6-NEXT: s_cbranch_execnz .LBB14_1 -; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(3) %ptr, bfloat 4.0 seq_cst - ret bfloat %result -} - -define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fadd_ret_bf16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; 
GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v5, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(3) %ptr, bfloat 4.0 seq_cst + ret bfloat %result +} + +define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwind { +; GFX12-TRUE16-LABEL: local_atomic_fadd_ret_bf16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_bf16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 
0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, 
v0, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_bf16__offset: ; GFX942: ; %bb.0: @@ -3374,53 +3973,102 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_ret_bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; 
GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: 
s_cbranch_execnz .LBB15_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_ret_bf16__offset: ; GFX10: ; %bb.0: @@ -3647,56 +4295,108 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin } define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fadd_noret_bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v2, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v3, v3 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fadd_noret_bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; 
GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_bf16: ; GFX942: ; %bb.0: @@ -3735,50 +4435,96 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; 
GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_noret_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v2, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v3, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 -; GFX11-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: 
buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, 
v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_bf16: ; GFX10: ; %bb.0: @@ -3990,57 +4736,110 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { } define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fadd_noret_bf16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fadd_noret_bf16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_bf16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: 
s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_bf16__offset: ; GFX942: ; %bb.0: @@ -4080,51 +4879,98 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_noret_bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: 
v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, 
v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
local_atomic_fadd_noret_bf16__offset: ; GFX10: ; %bb.0: @@ -4343,48 +5189,92 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin } define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fadd_ret_bf16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fadd_ret_bf16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_bf16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: 
s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_ret_bf16__offset__align4: ; GFX942: ; %bb.0: @@ -4418,42 +5308,80 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_ret_bf16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; 
GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_bf16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, 
s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_bf16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_ret_bf16__offset__align4: ; GFX10: ; %bb.0: @@ -4637,46 +5565,88 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) } define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fadd_noret_bf16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: 
s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fadd_noret_bf16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: 
v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_bf16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fadd_noret_bf16__offset__align4: ; GFX942: ; %bb.0: @@ -4709,40 +5679,76 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_noret_bf16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_bf16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_bf16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 
exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_bf16__offset__align4: ; GFX10: ; %bb.0: @@ -5829,52 +6835,101 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_ret_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: 
ds_cmpstore_rtn_b32 v2, v0, v2, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_v2bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_v2bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_ret_v2bf16: ; GFX10: ; %bb.0: @@ -6137,52 +7192,101 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_ret_v2bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_v2bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_v2bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_ret_v2bf16__offset: ; GFX10: ; %bb.0: @@ -6446,50 +7550,96 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_noret_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start 
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_v2bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: 
s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_v2bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_v2bf16: ; GFX10: ; %bb.0: @@ -6744,50 +7894,96 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fadd_noret_v2bf16__ofset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, 
v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_v2bf16__ofset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 
v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_v2bf16__ofset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fadd_noret_v2bf16__ofset: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index a0cbc4f538778..28504da5a6833 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -782,49 +784,93 @@ define void @local_atomic_fmax_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; -------------------------------------------------------------------- define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmax_ret_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start -; 
GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, 4.0, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f16: ; GFX942: ; %bb.0: @@ -857,43 +903,81 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_ret_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: 
s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .LBB8_1: ; 
%atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_f16: ; GFX10: ; %bb.0: @@ -1094,51 +1178,97 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { } define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmax_ret_f16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 
v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v3, 4.0, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_f16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_f16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f16__offset: ; GFX942: ; %bb.0: @@ -1172,45 +1302,85 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_ret_f16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v3, 4.0, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_f16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_f16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; 
GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_f16__offset: ; GFX10: ; %bb.0: @@ -1418,48 +1588,91 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { } define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmax_noret_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v2, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v3, v3 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; 
GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, 
v1, v4, v2 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, 
v2, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f16: ; GFX942: ; %bb.0: @@ -1491,42 +1704,79 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_noret_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v2, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v3, v3 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; 
GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_f16: ; GFX10: ; %bb.0: @@ -1719,50 +1969,95 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { } define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmax_noret_f16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_max_num_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 
v4, v0, v4, v3 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_f16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_f16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f16__offset: ; GFX942: ; %bb.0: @@ -1795,44 +2090,83 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_noret_f16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-NEXT: v_max_f16_e32 v4, 4.0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_f16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: 
v_max_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_f16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 +; 
GFX11-FAKE16-NEXT: v_max_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_f16__offset: ; GFX10: ; %bb.0: @@ -2032,40 +2366,75 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind } define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmax_ret_f16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v2, v2 -; GFX12-NEXT: v_max_num_f16_e32 v1, 4.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_f16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, 4.0, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; 
GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_f16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f16__offset__align4: ; GFX942: ; %bb.0: @@ -2091,34 +2460,63 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_ret_f16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX11-NEXT: v_max_f16_e32 v1, 4.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_f16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner 
Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, 4.0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_f16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; 
GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_f16__offset__align4: ; GFX10: ; %bb.0: @@ -2277,39 +2675,73 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no } define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmax_noret_f16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v2, 4.0, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: 
s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_f16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, 4.0, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_f16__offset__align4: 
+; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, 4.0, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f16__offset__align4: ; GFX942: ; %bb.0: @@ -2334,33 +2766,61 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_noret_f16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; 
GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v2, 4.0, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_f16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, 4.0, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; 
GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_f16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, 4.0, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_f16__offset__align4: ; GFX10: ; %bb.0: @@ -2517,57 +2977,110 @@ define void 
@local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; -------------------------------------------------------------------- define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmax_ret_bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, 4.0, v3 -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv 
scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; 
GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_bf16: ; GFX942: ; %bb.0: @@ -2607,51 +3120,98 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_ret_bf16: 
-; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX11-NEXT: 
s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_bf16: ; GFX10: ; %bb.0: @@ -2873,59 +3433,114 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { } define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmax_ret_bf16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v3, 4.0, v3 -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_bf16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: 
s_cbranch_execnz .LBB15_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_bf16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, 
v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_bf16__offset: ; GFX942: ; %bb.0: @@ -2966,53 +3581,102 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_ret_bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v3, 4.0, v3 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, 
v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v0, v1, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_bf16__offset: ; GFX10: ; %bb.0: @@ -3241,56 +3905,108 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin } define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmax_noret_bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v2, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v3, v3 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 
v0, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz 
.LBB16_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_bf16: ; GFX942: ; %bb.0: @@ -3325,54 +4041,100 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 -; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: local_atomic_fmax_noret_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v2, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v3, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_bf16: ; GFX10: ; %bb.0: @@ -3586,57 +4348,110 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind { } define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmax_noret_bf16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, 
v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: v_max_num_f32_e32 v4, 4.0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] 
+; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_bf16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; 
GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_bf16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_bf16__offset: ; GFX942: ; %bb.0: @@ -3676,51 +4491,98 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_noret_bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; 
GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_max_f32_e32 v4, 4.0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 
v3, v2, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_bf16__offset: ; GFX10: ; %bb.0: @@ -3941,48 +4803,92 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin } define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmax_ret_bf16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-NEXT: v_max_num_f32_e32 v1, 4.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 
v1, v3, v4, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_bf16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, 4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_bf16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_bf16__offset__align4: ; GFX942: ; %bb.0: @@ -4016,42 +4922,80 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_ret_bf16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_bf16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; 
GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_bf16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: 
v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_bf16__offset__align4: ; GFX10: ; %bb.0: @@ -4237,46 +5181,88 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) } define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmax_noret_bf16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v2, 4.0, v2 -; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; 
GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_bf16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_bf16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; 
GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_bf16__offset__align4: ; GFX942: ; %bb.0: @@ -4309,40 +5295,76 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_noret_bf16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_max_f32_e32 v2, 4.0, v2 -; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_bf16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_bf16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, 4.0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_bf16__offset__align4: ; GFX10: ; %bb.0: @@ -5600,57 +6622,111 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; -------------------------------------------------------------------- define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: local_atomic_fmax_ret_v2bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; 
GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_v2bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_v2bf16: ; GFX942: ; %bb.0: @@ -5692,52 +6768,101 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_ret_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_v2bf16: +; GFX11-FAKE16: ; %bb.0: +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; 
GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_v2bf16: ; GFX10: ; %bb.0: @@ -5979,57 +7104,111 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf } define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: local_atomic_fmax_ret_v2bf16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_v2bf16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-FAKE16-NEXT: s_wait_dscnt 
0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_v2bf16__offset: ; GFX942: ; %bb.0: @@ -6071,52 +7250,101 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_ret_v2bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_max_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v3 
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, 
v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_v2bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: 
s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-FAKE16-NEXT: ; 
%bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_ret_v2bf16__offset: ; GFX10: ; %bb.0: @@ -6359,54 +7587,105 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, } define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: local_atomic_fmax_noret_v2bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; 
GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_v2bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_v2bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; 
GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX942: ; %bb.0: @@ -6447,50 +7726,96 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_noret_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_v2bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: 
ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_v2bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_v2bf16: ; GFX10: ; %bb.0: @@ -6724,54 +8049,105 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> } define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: local_atomic_fmax_noret_v2bf16__ofset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_v2bf16__ofset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_v2bf16__ofset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 
16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX942: ; %bb.0: @@ -6812,50 +8188,96 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmax_noret_v2bf16__ofset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_max_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; 
GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_v2bf16__ofset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_v2bf16__ofset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmax_noret_v2bf16__ofset: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index d30d76e5ffda0..48714b7282b1e 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | 
FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -782,49 +784,93 @@ define void @local_atomic_fmin_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; -------------------------------------------------------------------- define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmin_ret_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX12-NEXT: v_min_num_f16_e32 v3, 4.0, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: 
v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f16: ; GFX942: ; %bb.0: @@ -857,43 +903,81 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_ret_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_ret_f16: ; GFX10: ; %bb.0: @@ -1094,51 +1178,97 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { } define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmin_ret_f16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v3, 4.0, v3 -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_f16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 
v2, v2 +; GFX12-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_f16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f16__offset: ; GFX942: ; %bb.0: @@ -1172,45 +1302,85 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; 
GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_ret_f16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v3, 4.0, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_f16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-FAKE16-LABEL: local_atomic_fmin_ret_f16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_ret_f16__offset: ; GFX10: ; %bb.0: @@ -1418,48 +1588,91 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { } define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmin_noret_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v2, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v3, v3 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 -; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; 
GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_f16: ; GFX942: ; %bb.0: @@ -1491,42 +1704,79 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_noret_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v2, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v3, v3 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 +; 
GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_f16: ; GFX10: ; %bb.0: @@ -1719,50 +1969,95 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { } define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmin_noret_f16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 -; GFX12-NEXT: v_min_num_f16_e32 v4, 4.0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_cbranch_execnz .LBB11_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_f16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_f16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: 
ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_f16__offset: ; GFX942: ; %bb.0: @@ -1795,44 +2090,83 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_noret_f16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX11-NEXT: v_min_f16_e32 v4, 4.0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_f16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_f16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; 
GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_f16__offset: ; GFX10: ; %bb.0: @@ -2032,40 +2366,75 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind } define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmin_ret_f16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f16_e32 v1, v2, v2 -; GFX12-NEXT: v_min_num_f16_e32 v1, 4.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe 
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_f16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, 4.0, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_f16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2 +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f16__offset__align4: ; GFX942: ; %bb.0: @@ -2091,34 +2460,63 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-NEXT: v_mov_b32_e32 v0, v1 
; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_ret_f16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX11-NEXT: v_min_f16_e32 v1, 4.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_f16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_f16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_ret_f16__offset__align4: ; GFX10: ; %bb.0: @@ -2277,39 +2675,73 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no } define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmin_noret_f16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_max_num_f16_e32 v2, v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f16_e32 v2, 4.0, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
local_atomic_fmin_noret_f16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, 4.0, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_f16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 
+; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, 4.0, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_f16__offset__align4: ; GFX942: ; %bb.0: @@ -2334,33 +2766,61 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_noret_f16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f16_e32 v2, 4.0, v2 -; 
GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_f16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, 4.0, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_f16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, 4.0, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_f16__offset__align4: ; GFX10: ; %bb.0: @@ -2517,57 +2977,110 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; -------------------------------------------------------------------- define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmin_ret_bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: 
s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v3, 4.0, v3 -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: 
v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_bf16: ; GFX942: ; %bb.0: @@ -2607,51 +3120,98 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_ret_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: 
v_not_b32_e32 v2, v2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v3, 4.0, v3 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 
v2, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; 
GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_ret_bf16: ; GFX10: ; %bb.0: @@ -2873,59 +3433,114 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { } define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmin_ret_bf16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v3, 4.0, v3 -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_bf16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_bf16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; 
GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_bf16__offset: ; GFX942: ; %bb.0: @@ -2966,53 +3581,102 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_ret_bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v3, 4.0, v3 -; 
GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_ret_bf16__offset: ; GFX10: ; %bb.0: @@ -3241,56 +3905,108 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin } define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmin_noret_bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v2, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v3, v3 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; 
GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; 
GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_bf16: ; GFX942: ; %bb.0: @@ -3325,54 +4041,100 @@ define void 
@local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_mov_b32_e32 v3, v4 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 -; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: local_atomic_fmin_noret_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v2, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v3, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; 
GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_bf16: ; GFX10: ; %bb.0: @@ -3586,57 +4348,110 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind { } define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmin_noret_bf16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; 
GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: v_min_num_f32_e32 v4, 4.0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_bf16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_bf16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_bf16__offset: ; GFX942: ; %bb.0: @@ -3676,51 +4491,98 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_noret_bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start -; 
GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_min_f32_e32 v4, 4.0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
local_atomic_fmin_noret_bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_bf16__offset: ; GFX10: ; %bb.0: @@ -3941,48 +4803,92 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin } define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmin_ret_bf16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-NEXT: v_min_num_f32_e32 v1, 4.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, 
v0, v1, v2 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_bf16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, 4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 
0xffff0000, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_bf16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, 4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, 
v4, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_bf16__offset__align4: ; GFX942: ; %bb.0: @@ -4016,42 +4922,80 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_ret_bf16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_bf16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_bf16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_ret_bf16__offset__align4: ; GFX10: ; %bb.0: @@ -4237,46 +5181,88 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) } define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fmin_noret_bf16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f32_e32 v2, 4.0, v2 -; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_bf16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_bf16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 
v2, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_bf16__offset__align4: ; GFX942: ; %bb.0: @@ -4309,40 +5295,76 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_noret_bf16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v2, 4.0, v2 -; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v2, 
0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_bf16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 
0xffff0000, v1, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_bf16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, 4.0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_bf16__offset__align4: ; GFX10: ; %bb.0: @@ -5600,57 +6622,111 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; -------------------------------------------------------------------- define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: local_atomic_fmin_ret_v2bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: 
v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; 
GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_v2bf16: +; 
GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, 
v0, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_v2bf16: ; GFX942: ; %bb.0: @@ -5692,52 +6768,101 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_ret_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: 
v_min_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 
1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_v2bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: 
s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: 
s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_ret_v2bf16: ; GFX10: ; %bb.0: @@ -5979,57 +7104,111 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf } define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: local_atomic_fmin_ret_v2bf16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff 
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_v2bf16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; 
GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_v2bf16__offset: ; GFX942: ; %bb.0: @@ -6071,52 +7250,101 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_ret_v2bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_min_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_v2bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
local_atomic_fmin_ret_v2bf16__offset: ; GFX10: ; %bb.0: @@ -6359,54 +7587,105 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, } define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: local_atomic_fmin_noret_v2bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe 
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_v2bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 
v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_v2bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 
v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX942: ; %bb.0: @@ -6447,50 +7726,96 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_noret_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_v2bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; 
GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_v2bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, 
vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_v2bf16: ; GFX10: ; %bb.0: @@ -6724,54 +8049,105 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> } define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: local_atomic_fmin_noret_v2bf16__ofset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; 
GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_v2bf16__ofset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_or_b32_e32 
v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_v2bf16__ofset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX942: ; %bb.0: @@ -6812,50 +8188,96 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; 
GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fmin_noret_v2bf16__ofset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_min_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_v2bf16__ofset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_v2bf16__ofset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fmin_noret_v2bf16__ofset: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index a8ef8ce1a4074..6879a7cfd09c2 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal 
-mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s @@ -1700,48 +1702,91 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind ; -------------------------------------------------------------------- define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fsub_ret_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v2, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v3, v3 -; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX12-NEXT: v_add_f16_e32 v2, -4.0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 -; GFX12-NEXT: 
s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB8_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; 
GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, 
v1, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_f16: ; GFX942: ; %bb.0: @@ -1773,42 +1818,79 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_ret_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v2, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v3, v3 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, v0, v4 -; GFX11-NEXT: v_add_f16_e32 v2, -4.0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 -; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v0, v4 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_ret_f16: ; GFX10: ; %bb.0: @@ -2005,50 +2087,95 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { } define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fsub_ret_f16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-NEXT: v_add_f16_e32 v3, -4.0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; 
GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB9_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_f16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_f16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_f16__offset: ; GFX942: ; %bb.0: @@ -2081,44 +2208,83 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_ret_f16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-NEXT: v_add_f16_e32 v3, -4.0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v3, 
0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB9_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_f16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_f16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_ret_f16__offset: ; GFX10: ; %bb.0: @@ -2322,47 +2488,89 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { } define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fsub_noret_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v2, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v3, v3 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; 
GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, 
v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_f16: ; GFX942: ; %bb.0: @@ -2393,41 +2601,77 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_noret_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v2, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v3, v3 -; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB10_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_f16: ; GFX10: ; %bb.0: @@ -2616,48 +2860,91 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { } define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fsub_noret_f16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: 
s_cbranch_execnz .LBB11_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_f16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe 
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_f16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; 
GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_f16__offset: ; GFX942: ; %bb.0: @@ -2689,42 +2976,79 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_noret_f16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v4, -4.0, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: 
v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB11_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_f16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_f16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1 +; 
GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_f16__offset: ; GFX10: ; %bb.0: @@ -2920,39 +3244,73 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind } define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fsub_ret_f16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f16_e32 v1, -4.0, v2 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_f16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_f16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_f16__offset__align4: ; GFX942: ; %bb.0: @@ -2977,33 +3335,61 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_ret_f16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v1, -4.0, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 
offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_f16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_f16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_ret_f16__offset__align4: ; GFX10: ; %bb.0: @@ -3158,37 +3544,69 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no } define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fsub_noret_f16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; 
GFX12-NEXT: v_add_f16_e32 v2, -4.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_f16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, 
vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_f16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_f16__offset__align4: ; GFX942: ; %bb.0: @@ -3212,31 +3630,57 @@ define void 
@local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_noret_f16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f16_e32 v2, -4.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB13_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_f16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: 
ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_f16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_f16__offset__align4: ; GFX10: ; %bb.0: @@ -3389,57 +3833,110 @@ define void 
@local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; -------------------------------------------------------------------- define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fsub_ret_bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv 
scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; 
GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_bf16: ; GFX942: ; %bb.0: @@ -3479,51 +3976,98 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_ret_bf16: -; 
GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB14_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX11-NEXT: 
s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; 
GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_ret_bf16: ; GFX10: ; %bb.0: @@ -3743,59 +4287,114 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { } define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fsub_ret_bf16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_bf16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, 
s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_bf16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: 
v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_bf16__offset: ; GFX942: ; %bb.0: @@ -3836,53 +4435,102 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_ret_bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v3, -4.0, v3 -; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3 +; 
GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_ret_bf16__offset: ; GFX10: ; %bb.0: @@ -4100,65 +4748,117 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 - %result = atomicrmw fsub ptr addrspace(3) %gep, bfloat 4.0 seq_cst - ret bfloat %result -} - -define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fsub_noret_bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: ds_load_b32 v2, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_not_b32_e32 v3, v3 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX12-NEXT: v_mov_b32_e32 v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 + %result = atomicrmw fsub ptr addrspace(3) %gep, bfloat 4.0 seq_cst + ret bfloat %result +} + +define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { +; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: 
s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: 
v_mov_b32_e32 v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; 
GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_bf16: ; GFX942: ; %bb.0: @@ -4197,50 +4897,96 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_noret_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: ds_load_b32 v2, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 24, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_not_b32_e32 v3, v3 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: local_atomic_fsub_noret_bf16: ; GFX10: ; %bb.0: @@ -4452,57 +5198,110 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind { } define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fsub_noret_bf16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_not_b32_e32 v2, v2 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: 
ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_bf16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, 
v4 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_bf16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_bf16__offset: ; GFX942: ; %bb.0: @@ -4542,51 +5341,98 @@ define void 
@local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_noret_bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, -4, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_add_f32_e32 v4, -4.0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, 
s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_bf16__offset: ; GFX10: ; %bb.0: @@ -4805,48 +5651,92 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin } define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fsub_ret_bf16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_bf16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_bf16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_bf16__offset__align4: ; GFX942: ; %bb.0: @@ -4880,42 +5770,80 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_ret_bf16__offset__align4: -; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_bf16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_bf16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_ret_bf16__offset__align4: ; GFX10: ; %bb.0: @@ -5099,46 +6027,88 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) } define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind { -; GFX12-LABEL: local_atomic_fsub_noret_bf16__offset__align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: 
s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-NEXT: v_mov_b32_e32 v1, v2 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_bf16__offset__align4: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB19_1: ; 
%atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_bf16__offset__align4: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 
v1, v0 offset:65534 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_bf16__offset__align4: ; GFX942: ; %bb.0: @@ -5171,40 +6141,76 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: 
local_atomic_fsub_noret_bf16__offset__align4: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v2 -; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_bf16__offset__align4: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_bf16__offset__align4: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_bf16__offset__align4: ; GFX10: ; %bb.0: @@ -6388,57 +7394,111 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; -------------------------------------------------------------------- define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: local_atomic_fsub_ret_v2bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 
0xffff0000, v1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: 
local_atomic_fsub_ret_v2bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 
0xffff, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_v2bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_v2bf16: ; GFX942: ; %bb.0: @@ -6480,52 +7540,101 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_ret_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; 
GFX11-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_v2bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: 
ds_load_b32 v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: 
s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_v2bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_ret_v2bf16: ; GFX10: ; %bb.0: @@ -6767,57 +7876,111 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf } define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: local_atomic_fsub_ret_v2bf16__offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX12-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_v2bf16__offset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 
v1, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; 
GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_v2bf16__offset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-FAKE16-NEXT: 
v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_ret_v2bf16__offset: ; GFX942: ; %bb.0: @@ -6859,52 +8022,101 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_ret_v2bf16__offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX11-NEXT: v_sub_f32_e32 v5, v5, 
v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 -; GFX11-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_v2bf16__offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start 
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_v2bf16__offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; 
GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_ret_v2bf16__offset: ; GFX10: ; %bb.0: @@ -7147,54 +8359,105 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, } define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: local_atomic_fsub_noret_v2bf16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v3, v0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; 
GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_v2bf16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) 
| instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_v2bf16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 
0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: 
local_atomic_fsub_noret_v2bf16: ; GFX942: ; %bb.0: @@ -7235,50 +8498,96 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_noret_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB26_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_v2bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; 
GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_v2bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: 
v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_v2bf16: ; GFX10: ; %bb.0: @@ -7512,54 +8821,105 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> } define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x bfloat> %val) { -; GFX12-LABEL: local_atomic_fsub_noret_v2bf16__ofset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX12-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v4 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_v2bf16__ofset: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; 
GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_v2bf16__ofset: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v4, v3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX942: ; %bb.0: @@ -7600,50 +8960,96 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: local_atomic_fsub_noret_v2bf16__ofset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 -; GFX11-NEXT: v_sub_f32_e32 v4, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v4 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB27_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_v2bf16__ofset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5 +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_v2bf16__ofset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: local_atomic_fsub_noret_v2bf16__ofset: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll index 5476c26e39ba9..14b91793bd8da 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll +++ 
b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefixes=GFX11 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 define amdgpu_ps i32 @uniform_v_to_s_i32(float inreg %a, float inreg %b) { ; GFX11-LABEL: uniform_v_to_s_i32: @@ -104,14 +105,23 @@ define amdgpu_ps <2 x i16> @uniform_v_to_s_2_i16(float inreg %a, float inreg %b) } define amdgpu_ps i16 @uniform_v_to_s_i16(half inreg %a, half inreg %b) { -; GFX11-LABEL: uniform_v_to_s_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_max_f16_e64 v0, s0, s1 -; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: uniform_v_to_s_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1 +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, s0, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: uniform_v_to_s_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_max_f16_e64 v0, s0, s1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part 
epilog %max = call half @llvm.maximum.f16(half %a, half %b) %cast = bitcast half %max to i16 ret i16 %cast diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 40a4d4af143a4..86fc0ace2c43f 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -1,13 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 
-verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-FAKE16 %s @@ -41,14 +43,21 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) { ; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX11-LABEL: basic_smax_smin: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin: ; SDAG-GFX12-TRUE16: ; %bb.0: @@ -95,15 +104,22 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) { ; GISEL-GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX11-LABEL: basic_smax_smin: -; GISEL-GFX11: ; %bb.0: -; 
GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX11-TRUE16-LABEL: basic_smax_smin: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-FAKE16-LABEL: basic_smax_smin: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin: ; GISEL-GFX12-TRUE16: ; %bb.0: @@ -169,18 +185,31 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; SDAG-GFX9-NEXT: s_endpgm ; -; SDAG-GFX11-LABEL: basic_smax_smin_sgpr: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; SDAG-GFX11-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_med3_i16 v0, s2, 0, 0xff -; SDAG-GFX11-NEXT: v_med3_i16 v1, s3, 0, 0xff -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SDAG-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; SDAG-GFX11-NEXT: 
global_store_b32 v2, v0, s[0:1] -; SDAG-GFX11-NEXT: s_endpgm +; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_sgpr: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, s2, 0, 0xff +; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v1.l, s3, 0, 0xff +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; SDAG-GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1] +; SDAG-GFX11-TRUE16-NEXT: s_endpgm +; +; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_sgpr: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, s2, 0, 0xff +; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v1, s3, 0, 0xff +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; SDAG-GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1] +; SDAG-GFX11-FAKE16-NEXT: s_endpgm ; ; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX12-TRUE16: ; %bb.0: @@ -320,14 +349,21 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) { ; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX11-LABEL: basic_smin_smax: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; 
SDAG-GFX11-TRUE16-LABEL: basic_smin_smax: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: basic_smin_smax: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-TRUE16-LABEL: basic_smin_smax: ; SDAG-GFX12-TRUE16: ; %bb.0: @@ -374,15 +410,22 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) { ; GISEL-GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX11-LABEL: basic_smin_smax: -; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX11-TRUE16-LABEL: basic_smin_smax: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-FAKE16-LABEL: basic_smin_smax: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff 
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX12-TRUE16-LABEL: basic_smin_smax: ; GISEL-GFX12-TRUE16: ; %bb.0: @@ -440,14 +483,21 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) { ; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX11-LABEL: basic_smin_smax_combined: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: basic_smin_smax_combined: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: basic_smin_smax_combined: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-TRUE16-LABEL: basic_smin_smax_combined: ; SDAG-GFX12-TRUE16: ; %bb.0: @@ -494,15 +544,22 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) { ; GISEL-GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX11-LABEL: basic_smin_smax_combined: -; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX11-TRUE16-LABEL: basic_smin_smax_combined: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-FAKE16-LABEL: basic_smin_smax_combined: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX12-TRUE16-LABEL: basic_smin_smax_combined: ; GISEL-GFX12-TRUE16: ; %bb.0: @@ -886,15 +943,25 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) { ; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: basic_smax_smin_bit_or: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_bit_or: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; SDAG-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_bit_or: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_bit_or: ; SDAG-GFX12-TRUE16: ; %bb.0: @@ -945,6 +1012,26 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) { ; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-TRUE16-LABEL: basic_smax_smin_bit_or: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; GISEL-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-FAKE16-LABEL: basic_smax_smin_bit_or: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_bit_or: ; GISEL-GFX12-TRUE16: ; %bb.0: ; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1001,15 +1088,25 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) { ; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: basic_umax_umin_bit_or: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_u16 v1, 0xff, v1 -; GFX11-NEXT: v_min_u16 v0, 0xff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: basic_umax_umin_bit_or: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_min_u16 v0.h, 0xff, v1.l +; SDAG-GFX11-TRUE16-NEXT: v_min_u16 v0.l, 0xff, v0.l +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; SDAG-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: basic_umax_umin_bit_or: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_min_u16 v1, 0xff, v1 +; SDAG-GFX11-FAKE16-NEXT: v_min_u16 v0, 0xff, v0 +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-TRUE16-LABEL: basic_umax_umin_bit_or: ; SDAG-GFX12-TRUE16: ; %bb.0: @@ -1057,6 +1154,26 
@@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) { ; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-TRUE16-LABEL: basic_umax_umin_bit_or: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: v_min_u16 v0.h, 0xff, v1.l +; GISEL-GFX11-TRUE16-NEXT: v_min_u16 v0.l, 0xff, v0.l +; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; GISEL-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-FAKE16-LABEL: basic_umax_umin_bit_or: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: v_min_u16 v1, 0xff, v1 +; GISEL-GFX11-FAKE16-NEXT: v_min_u16 v0, 0xff, v0 +; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX12-TRUE16-LABEL: basic_umax_umin_bit_or: ; GISEL-GFX12-TRUE16: ; %bb.0: ; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1116,17 +1233,29 @@ define i16 @basic_smax_smin_vec_cast(i16 %src0, i16 %src1) { ; SDAG-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX11-LABEL: basic_smax_smin_vec_cast: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SDAG-GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_vec_cast: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; SDAG-GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_vec_cast: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_vec_cast: ; SDAG-GFX12-TRUE16: ; %bb.0: @@ -1181,15 +1310,25 @@ define i16 @basic_smax_smin_vec_cast(i16 %src0, i16 %src1) { ; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX11-LABEL: basic_smax_smin_vec_cast: -; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; 
GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX11-TRUE16-LABEL: basic_smax_smin_vec_cast: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; GISEL-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-FAKE16-LABEL: basic_smax_smin_vec_cast: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_vec_cast: ; GISEL-GFX12-TRUE16: ; %bb.0: @@ -1250,15 +1389,25 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) { ; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: basic_smax_smin_bit_shl: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_i16 v1, v1, 0 -; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_bit_shl: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_max_i16 v0.h, v1.l, 0 +; 
SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; SDAG-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_bit_shl: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_max_i16 v1, v1, 0 +; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_bit_shl: ; SDAG-GFX12-TRUE16: ; %bb.0: @@ -1308,6 +1457,26 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) { ; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-TRUE16-LABEL: basic_smax_smin_bit_shl: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: v_max_i16 v0.h, v1.l, 0 +; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; GISEL-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-FAKE16-LABEL: basic_smax_smin_bit_shl: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: v_max_i16 v1, v1, 0 +; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 
+; GISEL-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_bit_shl: ; GISEL-GFX12-TRUE16: ; %bb.0: ; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1367,17 +1536,28 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) { ; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX11-LABEL: basic_smax_smin_vec_input: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 -; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_vec_input: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_pk_max_i16 v1, v0, 0 +; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_vec_input: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_vec_input: ; SDAG-GFX12-TRUE16: ; %bb.0: @@ -1434,20 +1614,34 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) { ; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX11-LABEL: basic_smax_smin_vec_input: -; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0 -; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX11-TRUE16-LABEL: basic_smax_smin_vec_input: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-TRUE16-NEXT: v_pk_max_i16 v1, 0, v0 +; GISEL-GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l +; GISEL-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l +; GISEL-GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l +; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-FAKE16-LABEL: basic_smax_smin_vec_input: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-FAKE16-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_vec_input: ; GISEL-GFX12-TRUE16: ; %bb.0: @@ -1516,17 +1710,28 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) { ; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX11-LABEL: basic_smax_smin_vec_input_rev: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] -; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_vec_input_rev: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
SDAG-GFX11-TRUE16-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_pk_min_i16 v1, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_vec_input_rev: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_vec_input_rev: ; SDAG-GFX12-TRUE16: ; %bb.0: @@ -1582,20 +1787,31 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) { ; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX11-LABEL: basic_smax_smin_vec_input_rev: -; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 -; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xff, 
v1 -; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX11-TRUE16-LABEL: basic_smax_smin_vec_input_rev: +; GISEL-GFX11-TRUE16: ; %bb.0: +; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-TRUE16-NEXT: v_pk_min_i16 v1, 0xff00ff, v0 +; GISEL-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h +; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-FAKE16-LABEL: basic_smax_smin_vec_input_rev: +; GISEL-GFX11-FAKE16: ; %bb.0: +; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-FAKE16-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_vec_input_rev: ; GISEL-GFX12-TRUE16: ; %bb.0: @@ -1638,3 +1854,5 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) { ret i16 %cast } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}}