diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 1c7ee724fef09..ff2595ef51869 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1355,6 +1355,10 @@ def FeatureLshlAddU64Inst : SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true", "Has v_lshl_add_u64 instruction">; +def FeatureAddSubU64Insts + : SubtargetFeature<"add-sub-u64-insts", "HasAddSubU64Insts", "true", + "Has v_add_u64 and v_sub_u64 instructions">; + def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts", "HasVMemToLDSLoad", "true", @@ -2010,6 +2014,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureMemoryAtomicFAddF32DenormalSupport, FeatureKernargPreload, FeatureLshlAddU64Inst, + FeatureAddSubU64Insts, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, ]>; @@ -2787,6 +2792,9 @@ def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">, def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">, AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>; +def HasAddSubU64Insts : Predicate<"Subtarget->hasAddSubU64Insts()">, + AssemblerPredicate<(all_of FeatureAddSubU64Insts)>; + def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">, AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index e7bf88d2ee5b6..fedfa3f9dd900 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4208,6 +4208,9 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, assert(Ty.isScalar()); unsigned Size = Ty.getSizeInBits(); + if (ST.hasVectorMulU64() && Size == 64) + return true; + unsigned NumParts = Size / 32; assert((Size % 32) == 0); assert(NumParts >= 2); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index f1caf2478e630..9b05f7c339738 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2528,7 +2528,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // Special case for s_mul_u64. There is not a vector equivalent of // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector // multiplications. - if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) { + if (!Subtarget.hasVectorMulU64() && Opc == AMDGPU::G_MUL && + DstTy.getSizeInBits() == 64) { applyMappingSMULU64(B, OpdMapper); return; } @@ -3973,7 +3974,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; } else { - OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); + if (MI.getOpcode() == AMDGPU::G_MUL && Subtarget.hasVectorMulU64()) + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + else + OpdsMapping[0] = + getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 8b758b011f6ad..5eddde1f72ec7 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -267,6 +267,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasMinimum3Maximum3F16 = false; bool HasMinimum3Maximum3PKF16 = false; bool HasLshlAddU64Inst = false; + bool HasAddSubU64Insts = false; bool HasPointSampleAccel = false; bool HasLdsBarrierArriveAtomic = false; bool HasSetPrioIncWgInst = false; @@ -1500,6 +1501,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasVOPD3() const { return GFX1250Insts; } + // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions. + bool hasAddSubU64Insts() const { return HasAddSubU64Insts; } + + // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions. + bool hasVectorMulU64() const { return GFX1250Insts; } + // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions. bool hasPkAddMinMaxInsts() const { return GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 441034b508c10..92a56a1d5f492 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -874,7 +874,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); - if (Subtarget->hasScalarSMulU64()) + if (Subtarget->hasVectorMulU64()) + setOperationAction(ISD::MUL, MVT::i64, Legal); + else if (Subtarget->hasScalarSMulU64()) setOperationAction(ISD::MUL, MVT::i64, Custom); if (Subtarget->hasMad64_32()) @@ -5421,6 +5423,19 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineOperand &Src0 = MI.getOperand(1); MachineOperand &Src1 = MI.getOperand(2); + if (ST.hasAddSubU64Insts()) { + auto I = BuildMI(*BB, MI, DL, + TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64 + : AMDGPU::V_SUB_U64_e64), + Dest.getReg()) + .add(Src0) + .add(Src1) + .addImm(0); // clamp + TII->legalizeOperands(*I); + MI.eraseFromParent(); + return BB; + } + if (IsAdd && ST.hasLshlAddU64Inst()) { auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64), Dest.getReg()) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 571f3efd68260..40e687178fb01 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7361,6 +7361,10 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MUL_U64: + if (ST.hasVectorMulU64()) { + NewOpcode = AMDGPU::V_MUL_U64_e64; + break; + } // Split s_mul_u64 in 32-bit vector multiplications. splitScalarSMulU64(Worklist, Inst, MDT); Inst.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b8537513ce986..485ca78db93a7 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2914,6 +2914,7 @@ def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>; def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>; +def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>; def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>; def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 030a6e1e978c1..550ec9d3f55ab 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -925,6 +925,17 @@ let isAdd = 1 in { defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32">; } +let isReMaterializable = 1 in { +let SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] in { +defm V_ADD_U64 : VOP2Inst <"v_add_nc_u64", VOP_I64_I64_I64_ARITH>; +// We don't actually have something like V_SUBREV_U64 so V_SUB_U64 can't be treated as commutable. +let isCommutable = 0 in +defm V_SUB_U64 : VOP2Inst <"v_sub_nc_u64", VOP_I64_I64_I64_ARITH>; +} // End SubtargetPredicate = HasAddSubU64Insts, SchedRW = [Write64Bit] +let SubtargetPredicate = isGFX1250Plus, SchedRW = [WriteDouble] in +defm V_MUL_U64 : VOP2Inst <"v_mul_u64", VOP_I64_I64_I64, DivergentBinFrag>; +} // End isReMaterializable = 1 + } // End isCommutable = 1 // These are special and do not read the exec mask. @@ -1754,6 +1765,9 @@ multiclass VOP2_Real_FULL_with_name op, string opName, VOP2_Realtriple_e64_with_name, VOP2_Real_NO_VOP3_with_name; +multiclass VOP2_Real_NO_DPP op> : + VOP2_Real_e32, VOP2_Real_e64; + multiclass VOP2_Real_NO_DPP_with_name op, string opName, string asmName> { defm NAME : VOP2_Real_e32_with_name, @@ -1843,6 +1857,9 @@ defm V_FMAC_F64 : VOP2_Real_FULL; defm V_FMAMK_F64 : VOP2Only_Real_MADK64; defm V_FMAAK_F64 : VOP2Only_Real_MADK64; +defm V_ADD_U64 : VOP2_Real_FULL; +defm V_SUB_U64 : VOP2_Real_FULL; +defm V_MUL_U64 : VOP2_Real_NO_DPP; //===----------------------------------------------------------------------===// // GFX11. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index f7f7e9645fa62..0d571d0e563b5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -6,6 +6,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16, -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16, -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { ; GCN-LABEL: s_mul_i16: @@ -22,6 +23,11 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_mul_i32 s0, s0, s1 ; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_mul_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mul_i32 s0, s0, s1 +; GFX1250-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -74,6 +80,13 @@ define i16 @v_mul_i16(i16 %num, i16 %den) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_mul_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -109,6 +122,13 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_mul_i16_zeroext: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mul_i32 s0, s0, s1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX1250-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -165,6 +185,15 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_mul_i16_zeroext: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -188,6 +217,13 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_sext_i32_i16 s0, s0 ; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_mul_i16_signext: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mul_i32 s0, s0, s1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_sext_i32_i16 s0, s0 +; GFX1250-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -248,6 +284,15 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_mul_i16_signext: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -267,6 +312,11 @@ define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_mul_i32 s0, s0, s1 ; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_mul_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mul_i32 s0, s0, s1 +; GFX1250-NEXT: ; return to shader part epilog %result = mul i32 %num, %den ret i32 %result } @@ -293,6 +343,13 @@ define i32 @v_mul_i32(i32 %num, i32 %den) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_mul_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i32 %num, %den ret i32 %result } @@ -315,6 +372,12 @@ define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %d ; GFX12-NEXT: s_mul_i32 s0, s0, s2 ; GFX12-NEXT: s_mul_i32 s1, s1, s3 ; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_mul_v2i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mul_i32 s0, s0, s2 +; GFX1250-NEXT: s_mul_i32 s1, s1, s3 +; GFX1250-NEXT: ; return to shader part epilog %result = mul <2 x i32> %num, %den ret <2 x i32> %result } @@ -344,6 +407,14 @@ define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) { ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_mul_v2i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul <2 x i32> %num, %den ret <2 x i32> %result } @@ -400,6 +471,11 @@ define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_mul_i33: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] +; GFX1250-NEXT: ; return to shader part epilog %result = mul i33 %num, %den ret i33 %result } @@ -456,6 +532,11 @@ define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] ; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_mul_i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] +; GFX1250-NEXT: ; return to shader part epilog %result = mul i64 %num, %den ret i64 %result } @@ -504,6 +585,13 @@ define i64 @v_mul_i64(i64 %num, i64 %den) { ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v2, v[3:4] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_mul_i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i64 %num, %den ret i64 %result } @@ -620,6 +708,26 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) { ; GFX12-NEXT: s_add_co_ci_u32 s2, s3, s0 ; GFX12-NEXT: s_mov_b32 s0, s5 ; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_mul_i96: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mul_i32 s6, s0, s5 +; GFX1250-NEXT: s_mul_i32 s7, s1, s4 +; GFX1250-NEXT: s_mul_i32 s2, s2, s3 +; GFX1250-NEXT: s_add_co_i32 s6, s6, s7 +; GFX1250-NEXT: s_mul_hi_u32 s7, s0, s3 +; GFX1250-NEXT: s_add_co_i32 s6, s6, s2 +; GFX1250-NEXT: s_mul_i32 s2, s0, s4 +; GFX1250-NEXT: s_mul_i32 s5, s0, s3 +; GFX1250-NEXT: s_mul_hi_u32 s0, s0, s4 +; GFX1250-NEXT: s_add_co_u32 s2, s2, s7 +; GFX1250-NEXT: s_mul_i32 s4, s1, s3 +; GFX1250-NEXT: s_add_co_ci_u32 s0, s0, s6 +; GFX1250-NEXT: s_mul_hi_u32 s3, s1, s3 +; GFX1250-NEXT: s_add_co_u32 s1, s4, s2 +; GFX1250-NEXT: s_add_co_ci_u32 s2, s3, s0 +; GFX1250-NEXT: s_mov_b32 s0, s5 +; GFX1250-NEXT: ; return to shader part epilog %result = mul i96 %num, %den %cast = bitcast i96 %result to <3 x i32> ret <3 x i32> %cast @@ -686,6 +794,25 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2] ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_mul_i96: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mul_lo_u32 v0, v6, v5 +; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9] +; GFX1250-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v11, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v6, v4, v[10:11] +; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v3, v[4:5] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i96 %num, %den ret i96 %result } @@ -895,6 +1022,42 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) { ; GFX12-NEXT: s_mov_b32 s1, s8 ; GFX12-NEXT: s_mov_b32 s2, s7 ; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_mul_i128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mul_i32 s9, s0, s6 +; GFX1250-NEXT: s_mul_i32 s11, s1, s5 +; GFX1250-NEXT: s_mul_hi_u32 s10, s0, s6 +; GFX1250-NEXT: s_mul_hi_u32 s12, s1, s5 +; GFX1250-NEXT: s_add_co_u32 s9, s11, s9 +; GFX1250-NEXT: s_mul_i32 s11, s2, s4 +; GFX1250-NEXT: s_add_co_ci_u32 s10, s12, s10 +; GFX1250-NEXT: s_mul_hi_u32 s12, s2, s4 +; GFX1250-NEXT: s_mul_hi_u32 s8, s0, s4 +; GFX1250-NEXT: s_add_co_u32 s9, s11, s9 +; GFX1250-NEXT: s_mul_i32 s11, s0, s5 +; GFX1250-NEXT: s_add_co_ci_u32 s10, s12, s10 +; GFX1250-NEXT: s_mul_hi_u32 s12, s0, s5 +; GFX1250-NEXT: s_add_co_u32 s8, s11, s8 +; GFX1250-NEXT: s_add_co_ci_u32 s9, s12, s9 +; GFX1250-NEXT: s_mul_i32 s12, s1, s4 +; GFX1250-NEXT: s_mul_hi_u32 s13, s1, s4 +; GFX1250-NEXT: s_cselect_b32 s11, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s8, s12, s8 +; GFX1250-NEXT: s_mul_i32 s12, s0, s7 +; GFX1250-NEXT: s_add_co_ci_u32 s7, s13, s9 +; GFX1250-NEXT: s_add_co_ci_u32 s9, s10, s12 +; GFX1250-NEXT: s_mul_i32 s1, s1, s6 +; GFX1250-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1250-NEXT: s_mul_i32 s2, s2, s5 +; GFX1250-NEXT: s_add_co_ci_u32 s1, s9, s1 +; GFX1250-NEXT: s_mul_i32 s3, s3, s4 +; GFX1250-NEXT: s_add_co_i32 s1, s1, s2 +; GFX1250-NEXT: s_mul_i32 s0, s0, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s1, s3 +; GFX1250-NEXT: s_mov_b32 s1, s8 +; GFX1250-NEXT: s_mov_b32 s2, s7 +; GFX1250-NEXT: ; return to shader part epilog %result = mul i128 %num, %den %cast = bitcast i128 %result to <4 x i32> ret <4 x i32> %cast @@ -1036,6 +1199,39 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_mul_i128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null, v9, v5, v[0:1] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], null, v2, v4, v[10:11] +; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v13, v10 +; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13] +; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13] +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0 +; GFX1250-NEXT: s_wait_alu 0xfffd +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v8, v1, vcc_lo +; GFX1250-NEXT: v_mov_b32_e32 v1, v6 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v5, v[8:9] +; GFX1250-NEXT: v_mov_b32_e32 v2, v7 +; GFX1250-NEXT: v_mad_co_u64_u32 v[4:5], null, v3, v4, v[8:9] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i128 %num, %den ret i128 %result } @@ -2020,6 +2216,185 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX12-NEXT: s_add_co_i32 s7, s1, s7 ; GFX12-NEXT: s_mov_b32 s1, s16 ; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_mul_i256: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mul_i32 s17, s0, s10 +; GFX1250-NEXT: s_mul_i32 s19, s1, s9 +; GFX1250-NEXT: s_mul_hi_u32 s18, s0, s10 +; GFX1250-NEXT: s_mul_hi_u32 s20, s1, s9 +; GFX1250-NEXT: s_add_co_u32 s17, s19, s17 +; GFX1250-NEXT: s_add_co_ci_u32 s18, s20, s18 +; GFX1250-NEXT: s_mul_i32 s20, s2, s8 +; GFX1250-NEXT: s_mul_hi_u32 s21, s2, s8 +; GFX1250-NEXT: s_cselect_b32 s19, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s17, s20, s17 +; GFX1250-NEXT: s_mul_hi_u32 s16, s0, s8 +; GFX1250-NEXT: s_add_co_ci_u32 s18, s21, s18 +; GFX1250-NEXT: s_mul_i32 s21, s0, s9 +; GFX1250-NEXT: s_mul_hi_u32 s22, s0, s9 +; GFX1250-NEXT: s_cselect_b32 s20, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s16, s21, s16 +; GFX1250-NEXT: s_add_co_ci_u32 s17, s22, s17 +; GFX1250-NEXT: s_mul_i32 s22, s1, s8 +; GFX1250-NEXT: s_mul_hi_u32 s23, s1, s8 +; GFX1250-NEXT: s_cselect_b32 s21, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s16, s22, s16 +; GFX1250-NEXT: s_add_co_ci_u32 s17, s23, s17 +; GFX1250-NEXT: s_mul_i32 s23, s0, s12 +; GFX1250-NEXT: s_mul_i32 s25, s1, s11 +; GFX1250-NEXT: s_mul_hi_u32 s24, s0, s12 +; GFX1250-NEXT: s_mul_hi_u32 s26, s1, s11 +; GFX1250-NEXT: s_cselect_b32 s22, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s23, s25, s23 +; GFX1250-NEXT: s_add_co_ci_u32 s24, s26, s24 +; GFX1250-NEXT: s_mul_i32 s26, s2, s10 +; GFX1250-NEXT: s_mul_hi_u32 s27, s2, s10 +; GFX1250-NEXT: s_cselect_b32 s25, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s23, s26, s23 +; GFX1250-NEXT: s_add_co_ci_u32 s24, s27, s24 +; GFX1250-NEXT: s_mul_i32 s27, s3, s9 +; GFX1250-NEXT: s_mul_hi_u32 s28, s3, s9 +; GFX1250-NEXT: s_cselect_b32 s26, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s23, s27, s23 +; GFX1250-NEXT: s_add_co_ci_u32 s24, s28, s24 +; GFX1250-NEXT: s_mul_i32 s28, s4, s8 +; GFX1250-NEXT: s_mul_hi_u32 s29, s4, s8 +; GFX1250-NEXT: s_cselect_b32 s27, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s23, s28, s23 +; GFX1250-NEXT: s_add_co_ci_u32 s24, s29, s24 +; GFX1250-NEXT: s_mul_i32 s29, s0, s11 +; GFX1250-NEXT: s_mul_hi_u32 s30, s0, s11 +; GFX1250-NEXT: s_cselect_b32 s28, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s18, s29, s18 +; GFX1250-NEXT: s_add_co_ci_u32 s23, s30, s23 +; GFX1250-NEXT: s_mul_i32 s30, s1, s10 +; GFX1250-NEXT: s_mul_hi_u32 s31, s1, s10 +; GFX1250-NEXT: s_cselect_b32 s29, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s18, s30, s18 +; GFX1250-NEXT: s_add_co_ci_u32 s23, s31, s23 +; GFX1250-NEXT: s_mul_i32 s31, s2, s9 +; GFX1250-NEXT: s_mul_hi_u32 s33, s2, s9 +; GFX1250-NEXT: s_cselect_b32 s30, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s18, s31, s18 +; GFX1250-NEXT: s_add_co_ci_u32 s23, s33, s23 +; GFX1250-NEXT: s_mul_i32 s33, s3, s8 +; GFX1250-NEXT: s_mul_hi_u32 s34, s3, s8 +; GFX1250-NEXT: s_cselect_b32 s31, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s18, s33, s18 +; GFX1250-NEXT: s_add_co_ci_u32 s23, s34, s23 +; GFX1250-NEXT: s_cselect_b32 s33, 1, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s22, 0 +; GFX1250-NEXT: s_mul_hi_u32 s22, s0, s14 +; GFX1250-NEXT: s_add_co_ci_u32 s18, s21, s18 +; GFX1250-NEXT: s_cselect_b32 s21, 1, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s20, 0 +; GFX1250-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX1250-NEXT: s_add_co_ci_u32 s19, s19, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s21, 0 +; GFX1250-NEXT: s_mul_i32 s21, s0, s14 +; GFX1250-NEXT: s_add_co_ci_u32 s19, s19, s23 +; GFX1250-NEXT: s_mul_i32 s23, s1, s13 +; GFX1250-NEXT: s_cselect_b32 s20, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s21, s23, s21 +; GFX1250-NEXT: s_mul_i32 s23, s2, s12 +; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX1250-NEXT: s_mul_hi_u32 s34, s2, s12 +; GFX1250-NEXT: s_add_co_u32 s21, s23, s21 +; GFX1250-NEXT: s_mul_i32 s23, s3, s11 +; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX1250-NEXT: s_mul_hi_u32 s34, s3, s11 +; GFX1250-NEXT: s_add_co_u32 s21, s23, s21 +; GFX1250-NEXT: s_mul_i32 s23, s4, s10 +; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX1250-NEXT: s_mul_hi_u32 s34, s4, s10 +; GFX1250-NEXT: s_add_co_u32 s21, s23, s21 +; GFX1250-NEXT: s_mul_i32 s23, s5, s9 +; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX1250-NEXT: s_mul_hi_u32 s34, s5, s9 +; GFX1250-NEXT: s_add_co_u32 s21, s23, s21 +; GFX1250-NEXT: s_mul_i32 s23, s6, s8 +; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX1250-NEXT: s_mul_hi_u32 s34, s6, s8 +; GFX1250-NEXT: s_add_co_u32 s21, s23, s21 +; GFX1250-NEXT: s_mul_i32 s23, s0, s13 +; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX1250-NEXT: s_mul_hi_u32 s34, s0, s13 +; GFX1250-NEXT: s_add_co_u32 s23, s23, s24 +; GFX1250-NEXT: s_add_co_ci_u32 s21, s34, s21 +; GFX1250-NEXT: s_mul_i32 s34, s1, s12 +; GFX1250-NEXT: s_mul_hi_u32 s35, s1, s12 +; GFX1250-NEXT: s_cselect_b32 s24, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s23, s34, s23 +; GFX1250-NEXT: s_add_co_ci_u32 s21, s35, s21 +; GFX1250-NEXT: s_mul_i32 s35, s2, s11 +; GFX1250-NEXT: s_mul_hi_u32 s36, s2, s11 +; GFX1250-NEXT: s_cselect_b32 s34, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s23, s35, s23 +; GFX1250-NEXT: s_add_co_ci_u32 s21, s36, s21 +; GFX1250-NEXT: s_mul_i32 s36, s3, s10 +; GFX1250-NEXT: s_mul_hi_u32 s37, s3, s10 +; GFX1250-NEXT: s_cselect_b32 s35, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s23, s36, s23 +; GFX1250-NEXT: s_add_co_ci_u32 s21, s37, s21 +; GFX1250-NEXT: s_mul_i32 s37, s4, s9 +; GFX1250-NEXT: s_mul_hi_u32 s38, s4, s9 +; GFX1250-NEXT: s_cselect_b32 s36, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s23, s37, s23 +; GFX1250-NEXT: s_add_co_ci_u32 s21, s38, s21 +; GFX1250-NEXT: s_mul_i32 s38, s5, s8 +; GFX1250-NEXT: s_mul_hi_u32 s39, s5, s8 +; GFX1250-NEXT: s_cselect_b32 s37, 1, 0 +; GFX1250-NEXT: s_add_co_u32 s23, s38, s23 +; GFX1250-NEXT: s_add_co_ci_u32 s21, s39, s21 +; GFX1250-NEXT: s_cselect_b32 s38, 1, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s30, 0 +; GFX1250-NEXT: s_mul_i32 s1, s1, s14 +; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s31, 0 +; GFX1250-NEXT: s_mul_i32 s2, s2, s13 +; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s33, 0 +; GFX1250-NEXT: s_mul_i32 s3, s3, s12 +; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s20, 0 +; GFX1250-NEXT: s_mul_i32 s4, s4, s11 +; GFX1250-NEXT: s_add_co_ci_u32 s20, s29, s23 +; GFX1250-NEXT: s_cselect_b32 s23, 1, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s26, 0 +; GFX1250-NEXT: s_mul_i32 s26, s0, s15 +; GFX1250-NEXT: s_add_co_ci_u32 s25, s25, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s27, 0 +; GFX1250-NEXT: s_mul_i32 s5, s5, s10 +; GFX1250-NEXT: s_add_co_ci_u32 s25, s25, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s28, 0 +; GFX1250-NEXT: s_mul_i32 s6, s6, s9 +; GFX1250-NEXT: s_add_co_ci_u32 s25, s25, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s23, 0 +; GFX1250-NEXT: s_mul_i32 s7, s7, s8 +; GFX1250-NEXT: s_add_co_ci_u32 s15, s25, s21 +; GFX1250-NEXT: s_add_co_ci_u32 s21, s22, s26 +; GFX1250-NEXT: s_cmp_lg_u32 s38, 0 +; GFX1250-NEXT: s_mul_i32 s0, s0, s8 +; GFX1250-NEXT: s_add_co_ci_u32 s1, s21, s1 +; GFX1250-NEXT: s_cmp_lg_u32 s37, 0 +; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s2 +; GFX1250-NEXT: s_cmp_lg_u32 s36, 0 +; GFX1250-NEXT: s_mov_b32 s2, s17 +; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s3 +; GFX1250-NEXT: s_cmp_lg_u32 s35, 0 +; GFX1250-NEXT: s_mov_b32 s3, s18 +; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s4 +; GFX1250-NEXT: s_cmp_lg_u32 s34, 0 +; GFX1250-NEXT: s_mov_b32 s4, s19 +; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s5 +; GFX1250-NEXT: s_cmp_lg_u32 s24, 0 +; GFX1250-NEXT: s_mov_b32 s5, s20 +; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s6 +; GFX1250-NEXT: s_mov_b32 s6, s15 +; GFX1250-NEXT: s_add_co_i32 s7, s1, s7 +; GFX1250-NEXT: s_mov_b32 s1, s16 +; GFX1250-NEXT: ; return to shader part epilog %result = mul i256 %num, %den %cast = bitcast i256 %result to <8 x i32> ret <8 x i32> %cast @@ -2478,6 +2853,107 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_mul_i256: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v14, 0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], null, v0, v12, 0 +; GFX1250-NEXT: v_mul_lo_u32 v26, v6, v9 +; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v1, v13, v[16:17] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19] +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v2, v12, v[16:17] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] +; GFX1250-NEXT: s_wait_alu 0xfffd +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], null, v0, v10, 0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v3, v11, v[16:17] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] +; GFX1250-NEXT: s_wait_alu 0xfffd +; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v4, v10, v[16:17] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v5, v9, v[16:17] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[16:17] +; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21] +; GFX1250-NEXT: v_mov_b32_e32 v20, v19 +; GFX1250-NEXT: s_wait_alu 0xfffd +; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v24, vcc_lo +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: v_cndmask_b32_e64 v19, 0, 1, s0 +; GFX1250-NEXT: v_mov_b32_e32 v21, v22 +; GFX1250-NEXT: v_mul_lo_u32 v22, v5, v10 +; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v2, v8, v[16:17] +; GFX1250-NEXT: s_wait_alu 0xfffd +; GFX1250-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v19, vcc_lo +; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v0, v13, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v25 +; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], vcc_lo, v1, v12, v[16:17] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21] +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2 +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v8, 0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21] +; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13 +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2 +; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19] +; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24 +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2 +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19] +; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13] +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 +; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2 +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11] +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19] +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2 +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v27, v13, s2 +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2 +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v6, v11, s2 +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1 +; GFX1250-NEXT: s_wait_alu 0xfffd +; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v26, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v8, v[0:1] +; GFX1250-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v7, v8 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i256 %num, %den ret i256 %result } @@ -2536,6 +3012,14 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: s_mul_u64_zext_with_vregs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 %ext = zext i32 %val to i64 %mul = mul i64 %ext, 80 @@ -2632,6 +3116,21 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: s_mul_u64_zext_with_sregs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 %ext = zext i32 %val to i64 %mul = mul i64 %ext, 80 @@ -2704,6 +3203,14 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: s_mul_u64_sext_with_vregs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 %ext = sext i32 %val to i64 %mul = mul i64 %ext, 80 @@ -2815,6 +3322,20 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: s_mul_u64_sext_with_sregs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_ashr_i32 s3, s2, 31 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 %ext = sext i32 %val to i64 %mul = mul i64 %ext, 80 diff --git a/llvm/test/CodeGen/AMDGPU/add_u64.ll b/llvm/test/CodeGen/AMDGPU/add_u64.ll new file mode 100644 index 0000000000000..0373027201378 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/add_u64.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s + +define amdgpu_ps <2 x float> @test_add_u64_vv(i64 %a, i64 %b) { +; GFX12-LABEL: test_add_u64_vv: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_add_u64_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: ; return to shader part epilog + %add = add i64 %a, %b + %ret = bitcast i64 %add to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_add_u64_vs(i64 %a, i64 inreg %b) { +; GFX12-LABEL: test_add_u64_vs: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v1, vcc_lo +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_add_u64_vs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] +; GFX1250-NEXT: ; return to shader part epilog + %add = add i64 %a, %b + %ret = bitcast i64 %add to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_add_u64_sv(i64 inreg %a, i64 %b) { +; GFX12-LABEL: test_add_u64_sv: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v1, vcc_lo +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_add_u64_sv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] +; GFX1250-NEXT: ; return to shader part epilog + %add = add i64 %a, %b + %ret = bitcast i64 %add to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_add_u64_ss(i64 inreg %a, i64 inreg %b) { +; GCN-LABEL: test_add_u64_ss: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: ; return to shader part epilog + %add = add i64 %a, %b + %ret = bitcast i64 %add to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_add_u64_v_inline_lit(i64 %a) { +; GFX12-LABEL: test_add_u64_v_inline_lit: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_add_u64_v_inline_lit: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 5, v[0:1] +; GFX1250-NEXT: ; return to shader part epilog + %add = add i64 %a, 5 + %ret = bitcast i64 %add to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_add_u64_v_small_imm(i64 %a) { +; GFX12-LABEL: test_add_u64_v_small_imm: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x1f4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_add_u64_v_small_imm: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 0x1f4, v[0:1] +; GFX1250-NEXT: ; return to shader part epilog + %add = add i64 %a, 500 + %ret = bitcast i64 %add to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_add_u64_v_64bit_imm(i64 %a) { +; GFX12-LABEL: test_add_u64_v_64bit_imm: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x3b9ac9ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 1, v1, vcc_lo +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_add_u64_v_64bit_imm: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0x13b9ac9ff), v[0:1] +; GFX1250-NEXT: ; return to shader part epilog + %add = add i64 %a, 5294967295 + %ret = bitcast i64 %add to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_add_u64_s_small_imm(i64 inreg %a) { +; GCN-LABEL: test_add_u64_s_small_imm: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x1f4 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: ; return to shader part epilog + %add = add i64 %a, 500 + %ret = bitcast i64 %add to <2 x float> + ret <2 x float> %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll index 95504052249e0..7fec5f71ce8d5 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -152,7 +152,7 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: s_wait_xcnt 0x0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GCN-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GCN-NEXT: s_mov_b32 s0, exec_lo ; GCN-NEXT: v_cmpx_ne_u32_e32 0, v2 ; GCN-NEXT: s_cbranch_execnz .LBB3_1 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index eff68ce2de11d..4a634520c682e 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -9,6 +9,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W32 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W64 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250 %s ; GCN-ISEL-LABEL: name: sadd64rr ; GCN-ISEL-LABEL: body: @@ -113,6 +114,19 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: sadd64rr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm entry: %add = add i64 %a, %b store i64 %add, ptr addrspace(1) %out @@ -211,6 +225,17 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: sadd64ri: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], lit64(0x123456789876) +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm entry: %add = add i64 20015998343286, %a store i64 %add, ptr addrspace(1) %out @@ -301,6 +326,17 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: vadd64rr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1] +; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1] +; GFX1250-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -391,6 +427,17 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: vadd64ri: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], lit64(0x123456789876), v[0:1] +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1] +; GFX1250-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -486,6 +533,18 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: suaddo32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s0, s0, s1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %uadd, 0 %carry = extractvalue { i32, i1 } %uadd, 1 @@ -606,6 +665,21 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: uaddo32_vcc_user: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_add_co_u32 v1, s4, s6, s7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX1250-NEXT: s_endpgm %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %uadd, 0 %carry = extractvalue { i32, i1 } %uadd, 1 @@ -741,6 +815,22 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: suaddo64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5] +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX1250-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 %carry = extractvalue { i64, i1 } %uadd, 1 @@ -874,6 +964,23 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: vuaddo64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1] +; GFX1250-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[2:3] +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1] +; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3] +; GFX1250-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %tid.ext) @@ -987,6 +1094,19 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: ssub64rr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm entry: %sub = sub i64 %a, %b store i64 %sub, ptr addrspace(1) %out @@ -1085,6 +1205,17 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: ssub64ri: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_sub_nc_u64 s[2:3], lit64(0x123456789876), s[2:3] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm entry: %sub = sub i64 20015998343286, %a store i64 %sub, ptr addrspace(1) %out @@ -1175,6 +1306,17 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: vsub64rr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[2:3], v[0:1] +; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1] +; GFX1250-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1265,6 +1407,17 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: vsub64ri: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], lit64(0x123456789876), v[0:1] +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1] +; GFX1250-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1361,6 +1514,18 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: susubo32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %usub, 0 %carry = extractvalue { i32, i1 } %usub, 1 @@ -1481,6 +1646,21 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: usubo32_vcc_user: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_sub_co_u32 v1, s4, s6, s7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX1250-NEXT: s_endpgm %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %usub, 0 %carry = extractvalue { i32, i1 } %usub, 1 @@ -1616,6 +1796,22 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: susubo64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_sub_nc_u64 s[6:7], s[4:5], s[6:7] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5] +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX1250-NEXT: s_wait_alu 0xf1ff +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX1250-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %usub, 0 %carry = extractvalue { i64, i1 } %usub, 1 @@ -1749,6 +1945,23 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX1250-LABEL: vusubo64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[6:7], v[0:1] +; GFX1250-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[2:3] +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1] +; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3] +; GFX1250-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %tid.ext) @@ -2904,6 +3117,191 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: .LBB16_4: ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-NEXT: s_branch .LBB16_2 +; +; GFX1250-LABEL: sudiv64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x34 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_b64 s[0:1], s[0:1], lit64(0xffffffff00000000) +; GFX1250-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4 +; GFX1250-NEXT: ; %bb.1: +; GFX1250-NEXT: s_cvt_f32_u32 s0, s2 +; GFX1250-NEXT: s_cvt_f32_u32 s1, s3 +; GFX1250-NEXT: s_sub_nc_u64 s[6:7], 0, s[2:3] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX1250-NEXT: s_fmac_f32 s0, s1, 0x4f800000 +; GFX1250-NEXT: v_s_rcp_f32 s0, s0 +; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX1250-NEXT: s_mul_f32 s0, s0, 0x5f7ffffc +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_mul_f32 s1, s0, 0x2f800000 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) +; GFX1250-NEXT: s_trunc_f32 s1, s1 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_fmac_f32 s0, s1, 0xcf800000 +; GFX1250-NEXT: s_cvt_u32_f32 s5, s1 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_cvt_u32_f32 s4, s0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_mul_u64 s[12:13], s[6:7], s[4:5] +; GFX1250-NEXT: s_mul_hi_u32 s15, s4, s13 +; GFX1250-NEXT: s_mul_i32 s14, s4, s13 +; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s12 +; GFX1250-NEXT: s_mul_i32 s17, s5, s12 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[0:1], s[14:15] +; GFX1250-NEXT: s_mul_hi_u32 s16, s5, s12 +; GFX1250-NEXT: s_mul_hi_u32 s18, s5, s13 +; GFX1250-NEXT: s_add_co_u32 s0, s14, s17 +; GFX1250-NEXT: s_add_co_ci_u32 s0, s15, s16 +; GFX1250-NEXT: s_mul_i32 s12, s5, s13 +; GFX1250-NEXT: s_add_co_ci_u32 s13, s18, 0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_add_co_u32 v0, s0, s4, s12 +; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-NEXT: s_add_co_ci_u32 s5, s5, s13 +; GFX1250-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_mul_u64 s[6:7], s[6:7], s[4:5] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_mul_hi_u32 s13, s4, s7 +; GFX1250-NEXT: s_mul_i32 s12, s4, s7 +; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s6 +; GFX1250-NEXT: s_mul_i32 s15, s5, s6 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13] +; GFX1250-NEXT: s_mul_hi_u32 s14, s5, s6 +; GFX1250-NEXT: s_mul_hi_u32 s4, s5, s7 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_add_co_u32 s0, s12, s15 +; GFX1250-NEXT: s_add_co_ci_u32 s0, s13, s14 +; GFX1250-NEXT: s_mul_i32 s6, s5, s7 +; GFX1250-NEXT: s_add_co_ci_u32 s7, s4, 0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[0:1], s[6:7] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_add_co_u32 v0, s0, v0, s6 +; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s7 +; GFX1250-NEXT: v_readfirstlane_b32 s7, v0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_mul_hi_u32 s5, s10, s0 +; GFX1250-NEXT: s_mul_i32 s4, s10, s0 +; GFX1250-NEXT: s_mul_hi_u32 s12, s11, s0 +; GFX1250-NEXT: s_mul_i32 s6, s11, s0 +; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s7 +; GFX1250-NEXT: s_mul_i32 s13, s11, s7 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[4:5] +; GFX1250-NEXT: s_mul_hi_u32 s0, s11, s7 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_add_co_u32 s4, s4, s13 +; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s0 +; GFX1250-NEXT: s_add_co_ci_u32 s7, s12, 0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[6:7] +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_and_b64 s[6:7], s[4:5], lit64(0xffffffff00000000) +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_or_b32 s6, s6, s4 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[6:7] +; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[6:7], 2 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: v_sub_co_u32 v0, s0, s10, s4 +; GFX1250-NEXT: s_sub_co_i32 s4, s11, s5 +; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX1250-NEXT: v_sub_co_u32 v1, s12, v0, s2 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, s3 +; GFX1250-NEXT: s_cmp_lg_u32 s12, 0 +; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], 1 +; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, 0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_cmp_ge_u32 s4, s3 +; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1250-NEXT: s_cmp_eq_u32 s4, s3 +; GFX1250-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: v_cndmask_b32_e32 v1, s14, v1, vcc_lo +; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 +; GFX1250-NEXT: s_sub_co_ci_u32 s0, s11, s5 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_cmp_ge_u32 s0, s3 +; GFX1250-NEXT: s_wait_alu 0xfffd +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: s_cmp_eq_u32 s0, s3 +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s4, v0, s0 +; GFX1250-NEXT: s_wait_alu 0xfffd +; GFX1250-NEXT: v_cndmask_b32_e32 v2, s12, v2, vcc_lo +; GFX1250-NEXT: v_cndmask_b32_e32 v1, s13, v3, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX1250-NEXT: s_wait_alu 0xfffd +; GFX1250-NEXT: v_cndmask_b32_e32 v1, s7, v1, vcc_lo +; GFX1250-NEXT: v_cndmask_b32_e32 v0, s6, v2, vcc_lo +; GFX1250-NEXT: s_cbranch_execnz .LBB16_3 +; GFX1250-NEXT: .LBB16_2: +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX1250-NEXT: s_sub_co_i32 s1, 0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX1250-NEXT: v_nop +; GFX1250-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_mul_i32 s1, s1, s0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_mul_hi_u32 s1, s0, s1 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_add_co_i32 s0, s0, s1 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_mul_i32 s1, s0, s2 +; GFX1250-NEXT: s_add_co_i32 s3, s0, 1 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_sub_co_i32 s1, s10, s1 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_sub_co_i32 s4, s1, s2 +; GFX1250-NEXT: s_cmp_ge_u32 s1, s2 +; GFX1250-NEXT: s_cselect_b32 s0, s3, s0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_cselect_b32 s1, s4, s1 +; GFX1250-NEXT: s_add_co_i32 s3, s0, 1 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: s_cmp_ge_u32 s1, s2 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: s_cselect_b32 s0, s3, s0 +; GFX1250-NEXT: s_wait_alu 0xfffe +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-NEXT: .LBB16_3: +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9] +; GFX1250-NEXT: s_endpgm +; GFX1250-NEXT: .LBB16_4: +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_branch .LBB16_2 %result = udiv i64 %x, %y store i64 %result, ptr addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll index dea9142cf2bee..f9fae025e0bf8 100644 --- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll +++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll @@ -737,7 +737,7 @@ define i64 @v_add_u64_vop2_literal_32(i64 %x) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] -; GFX1250-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 0x7b ; encoding: [0x00,0x00,0x52,0xd6,0x00,0x01,0xfd,0x03,0x7b,0x00,0x00,0x00] +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 0x7b, v[0:1] ; encoding: [0xff,0x00,0x00,0x50,0x7b,0x00,0x00,0x00] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %add = add i64 %x, 123 ret i64 %add @@ -747,7 +747,7 @@ define i64 @v_add_u64_vop2_literal_32(i64 %x) { ; GFX10: codeLenInByte = 28 ; GFX1100: codeLenInByte = 32 ; GFX1150: codeLenInByte = 32 -; GFX1250: codeLenInByte = 24 +; GFX1250: codeLenInByte = 20 define i64 @v_add_u64_vop2_literal_64(i64 %x) { ; GFX9-LABEL: v_add_u64_vop2_literal_64: @@ -788,9 +788,7 @@ define i64 @v_add_u64_vop2_literal_64(i64 %x) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] -; GFX1250-NEXT: s_mov_b64 s[0:1], lit64(0x112345678) ; encoding: [0xfe,0x01,0x80,0xbe,0x78,0x56,0x34,0x12,0x01,0x00,0x00,0x00] -; GFX1250-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf] -; GFX1250-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; encoding: [0x00,0x00,0x52,0xd6,0x00,0x01,0x01,0x00] +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0x112345678), v[0:1] ; encoding: [0xfe,0x00,0x00,0x50,0x78,0x56,0x34,0x12,0x01,0x00,0x00,0x00] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %add = add i64 %x, 4600387192 ret i64 %add @@ -800,6 +798,6 @@ define i64 @v_add_u64_vop2_literal_64(i64 %x) { ; GFX10: codeLenInByte = 28 ; GFX1100: codeLenInByte = 32 ; GFX1150: codeLenInByte = 32 -; GFX1250: codeLenInByte = 36 +; GFX1250: codeLenInByte = 24 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; NOT-GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index f4040f3049e0d..eba46a1ecb614 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -256,7 +256,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -350,8 +350,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -455,7 +455,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB12_3 @@ -529,8 +529,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -676,7 +676,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -704,7 +704,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -751,7 +751,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5] +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe @@ -772,8 +772,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -805,7 +805,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -856,7 +856,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5] +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe @@ -879,7 +879,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB20_3 @@ -904,7 +904,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; @@ -943,7 +943,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5] +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -959,8 +959,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -989,7 +989,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; @@ -1032,7 +1032,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5] +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1112,7 +1112,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -1131,7 +1131,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2 @@ -1140,9 +1140,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 -; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd -; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -1179,7 +1177,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2 @@ -1189,9 +1187,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v4 -; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v5, vcc_lo +; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe @@ -1212,8 +1208,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1236,7 +1232,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2 @@ -1245,9 +1241,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 -; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd -; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -1288,7 +1282,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2 @@ -1298,9 +1292,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v4 -; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v5, vcc_lo +; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe @@ -1323,7 +1315,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB28_3 @@ -1338,7 +1330,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -1348,9 +1340,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 -; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd -; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; @@ -1378,7 +1368,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -1389,9 +1379,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 -; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1407,8 +1395,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -1427,7 +1415,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -1437,9 +1425,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 -; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd -; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; @@ -1471,7 +1457,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 @@ -1482,9 +1468,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 -; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1564,7 +1548,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -1662,8 +1646,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1771,7 +1755,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB36_3 @@ -1853,8 +1837,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -2008,7 +1992,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -2106,8 +2090,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2215,7 +2199,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB44_3 @@ -2297,8 +2281,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -2452,7 +2436,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -2550,8 +2534,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2659,7 +2643,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB52_3 @@ -2741,8 +2725,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -2890,7 +2874,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -2992,8 +2976,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3105,7 +3089,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB60_3 @@ -3187,8 +3171,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -3336,7 +3320,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -3438,8 +3422,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3551,7 +3535,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB68_3 @@ -3633,8 +3617,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -3782,7 +3766,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -3884,8 +3868,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3997,7 +3981,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB76_3 @@ -4079,8 +4063,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -4228,7 +4212,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -4330,8 +4314,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -4443,7 +4427,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB84_3 @@ -4525,8 +4509,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -4695,7 +4679,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -4802,8 +4786,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -4920,7 +4904,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB92_3 @@ -5010,8 +4994,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -5164,7 +5148,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -5192,10 +5176,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 @@ -5243,10 +5227,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off @@ -5269,8 +5253,8 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5302,10 +5286,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 @@ -5357,10 +5341,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off @@ -5385,7 +5369,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB100_3 @@ -5408,10 +5392,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm @@ -5449,10 +5433,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off @@ -5470,8 +5454,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -5498,10 +5482,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm @@ -5543,10 +5527,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off @@ -5621,7 +5605,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -5651,7 +5635,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], -1, v[0:1] ; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5703,7 +5687,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, -1 +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], -1, v[0:1] ; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5727,8 +5711,8 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5762,7 +5746,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], -1, v[0:1] ; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5818,7 +5802,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, -1 +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], -1, v[0:1] ; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5844,7 +5828,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB108_3 @@ -5869,7 +5853,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1 +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1] ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe ; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -5913,7 +5897,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1 +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1] ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe ; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe @@ -5934,8 +5918,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -5964,7 +5948,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1 +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1] ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe ; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe @@ -6012,7 +5996,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1 +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1] ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe ; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index e6018e413a85d..3f1e354f2ccc7 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -341,7 +341,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %s ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -673,7 +673,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffse ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; return to shader part epilog @@ -703,7 +703,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3 ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388607 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; return to shader part epilog @@ -2140,7 +2140,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { ; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 4 +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3] ; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2 @@ -2198,7 +2198,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 4 +; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3] ; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index 79907fd0c60bc..fd644a35f61e3 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -304,78 +304,79 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 ; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 ; GCN-SDAG-NEXT: s_clause 0x7 -; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:112 -; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:96 -; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:80 +; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112 +; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96 +; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80 ; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:48 -; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off offset:32 -; GCN-SDAG-NEXT: global_load_b128 v[22:25], v[0:1], off offset:16 -; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off +; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:32 +; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16 +; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off ; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[16:17], 0x70 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[24:25], 0x70 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 48 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x50 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 32 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 64 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[40:41], 16 -; GCN-SDAG-NEXT: v_dual_mov_b32 v14, 0xc8 :: v_dual_mov_b32 v15, 0 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x50 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[42:43], 0 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 64 +; GCN-SDAG-NEXT: v_dual_mov_b32 v22, 0xc8 :: v_dual_mov_b32 v23, 0 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x7 -; GCN-SDAG-NEXT: global_store_b128 v[16:17], v[6:9], off +; GCN-SDAG-NEXT: global_store_b128 v[24:25], v[10:13], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x6 -; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[10:13], off +; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[18:21], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x5 ; GCN-SDAG-NEXT: s_wait_xcnt 0x1 -; GCN-SDAG-NEXT: v_dual_mov_b32 v16, v20 :: v_dual_mov_b32 v17, v21 +; GCN-SDAG-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v9 ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 -; GCN-SDAG-NEXT: v_lshl_add_u64 v[12:13], v[12:13], 0, v[12:13] -; GCN-SDAG-NEXT: v_lshl_add_u64 v[10:11], v[10:11], 0, v[10:11] -; GCN-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[8:9], 0, v[8:9] -; GCN-SDAG-NEXT: v_lshl_add_u64 v[6:7], v[6:7], 0, v[6:7] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11] ; GCN-SDAG-NEXT: s_wait_loadcnt 0x4 ; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x3 -; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[30:33], off +; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[14:17], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x2 -; GCN-SDAG-NEXT: global_store_b128 v[40:41], v[22:25], off +; GCN-SDAG-NEXT: global_store_b128 v[40:41], v[26:29], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x1 -; GCN-SDAG-NEXT: global_store_b128 v[42:43], v[26:29], off +; GCN-SDAG-NEXT: global_store_b128 v[42:43], v[30:33], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 ; GCN-SDAG-NEXT: s_wait_xcnt 0x3 -; GCN-SDAG-NEXT: v_lshl_add_u64 v[52:53], v[2:3], 0, v[2:3] -; GCN-SDAG-NEXT: v_lshl_add_u64 v[50:51], v[0:1], 0, v[0:1] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[52:53], v[2:3], v[2:3] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[0:1], v[0:1] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7] +; GCN-SDAG-NEXT: s_wait_xcnt 0x2 +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], 0x64, v[16:17] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15] ; GCN-SDAG-NEXT: s_wait_xcnt 0x1 -; GCN-SDAG-NEXT: v_lshl_add_u64 v[24:25], v[24:25], 0, v[24:25] -; GCN-SDAG-NEXT: v_lshl_add_u64 v[22:23], v[22:23], 0, v[22:23] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27] ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 -; GCN-SDAG-NEXT: v_lshl_add_u64 v[28:29], v[28:29], 0, v[28:29] -; GCN-SDAG-NEXT: v_lshl_add_u64 v[26:27], v[26:27], 0, v[26:27] -; GCN-SDAG-NEXT: v_lshl_add_u64 v[36:37], v[36:37], 0, v[36:37] -; GCN-SDAG-NEXT: v_lshl_add_u64 v[34:35], v[34:35], 0, v[34:35] -; GCN-SDAG-NEXT: v_lshl_add_u64 v[32:33], v[32:33], 0, 0x64 -; GCN-SDAG-NEXT: v_lshl_add_u64 v[30:31], v[30:31], 0, v[30:31] -; GCN-SDAG-NEXT: v_lshl_add_u64 v[20:21], v[20:21], 0, v[20:21] -; GCN-SDAG-NEXT: v_lshl_add_u64 v[18:19], v[18:19], 0, 0xc8 +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[36:37], v[36:37], v[36:37] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[34:35], v[34:35], v[34:35] ; GCN-SDAG-NEXT: s_clause 0x1 -; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[14:17], off +; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[22:25], off ; GCN-SDAG-NEXT: global_store_b128 v[48:49], v[0:3], off ; GCN-SDAG-NEXT: s_clause 0x7 -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:96 -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:112 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:96 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:112 ; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[50:53], off offset:64 -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:80 -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off offset:32 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:80 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32 ; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[34:37], off offset:48 -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off -; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[22:25], off offset:16 +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off +; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:16 ; GCN-SDAG-NEXT: s_clause 0x3 ; GCN-SDAG-NEXT: scratch_load_b32 v43, off, s32 ; GCN-SDAG-NEXT: scratch_load_b32 v42, off, s32 offset:4 ; GCN-SDAG-NEXT: scratch_load_b32 v41, off, s32 offset:8 ; GCN-SDAG-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; GCN-SDAG-NEXT: s_wait_xcnt 0xc -; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v28 :: v_dual_mov_b32 v1, v29 +; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 ; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] ; @@ -403,11 +404,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: v_mov_b64_e32 v[48:49], 16 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[50:51], 32 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[52:53], 48 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x60 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[54:55], 64 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x70 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[34:35], 0xc8 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[40:41], 0x50 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x60 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x70 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x6 ; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[10:13], off ; GCN-GISEL-NEXT: s_wait_loadcnt 0x5 @@ -422,28 +423,28 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: global_store_b128 v[44:45], v[30:33], off ; GCN-GISEL-NEXT: v_mov_b64_e32 v[36:37], v[8:9] ; GCN-GISEL-NEXT: s_wait_xcnt 0x5 -; GCN-GISEL-NEXT: v_lshl_add_u64 v[10:11], v[10:11], 0, v[10:11] -; GCN-GISEL-NEXT: v_lshl_add_u64 v[12:13], v[12:13], 0, v[12:13] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13] ; GCN-GISEL-NEXT: s_wait_xcnt 0x4 -; GCN-GISEL-NEXT: v_lshl_add_u64 v[14:15], v[14:15], 0, v[14:15] -; GCN-GISEL-NEXT: v_lshl_add_u64 v[16:17], v[16:17], 0, v[16:17] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17] ; GCN-GISEL-NEXT: s_wait_xcnt 0x3 -; GCN-GISEL-NEXT: v_lshl_add_u64 v[18:19], v[18:19], 0, v[18:19] -; GCN-GISEL-NEXT: v_lshl_add_u64 v[20:21], v[20:21], 0, 0x64 +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[20:21], 0x64, v[20:21] ; GCN-GISEL-NEXT: s_wait_xcnt 0x2 -; GCN-GISEL-NEXT: v_lshl_add_u64 v[22:23], v[22:23], 0, v[22:23] -; GCN-GISEL-NEXT: v_lshl_add_u64 v[24:25], v[24:25], 0, v[24:25] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[22:23], v[22:23], v[22:23] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[24:25], v[24:25], v[24:25] ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 -; GCN-GISEL-NEXT: v_lshl_add_u64 v[48:49], v[0:1], 0, v[0:1] -; GCN-GISEL-NEXT: v_lshl_add_u64 v[50:51], v[2:3], 0, v[2:3] -; GCN-GISEL-NEXT: v_lshl_add_u64 v[6:7], v[6:7], 0, 0xc8 -; GCN-GISEL-NEXT: v_lshl_add_u64 v[8:9], v[8:9], 0, v[8:9] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9] ; GCN-GISEL-NEXT: s_wait_xcnt 0x1 -; GCN-GISEL-NEXT: v_lshl_add_u64 v[26:27], v[26:27], 0, v[26:27] -; GCN-GISEL-NEXT: v_lshl_add_u64 v[28:29], v[28:29], 0, v[28:29] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29] ; GCN-GISEL-NEXT: s_wait_xcnt 0x0 -; GCN-GISEL-NEXT: v_lshl_add_u64 v[30:31], v[30:31], 0, v[30:31] -; GCN-GISEL-NEXT: v_lshl_add_u64 v[32:33], v[32:33], 0, v[32:33] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33] ; GCN-GISEL-NEXT: s_clause 0x1 ; GCN-GISEL-NEXT: global_store_b128 v[54:55], v[0:3], off ; GCN-GISEL-NEXT: global_store_b128 v[40:41], v[34:37], off diff --git a/llvm/test/CodeGen/AMDGPU/literal64.ll b/llvm/test/CodeGen/AMDGPU/literal64.ll index df4ff2c8d9851..6706e7638580d 100644 --- a/llvm/test/CodeGen/AMDGPU/literal64.ll +++ b/llvm/test/CodeGen/AMDGPU/literal64.ll @@ -12,21 +12,11 @@ define amdgpu_ps i64 @s_add_u64(i64 inreg %a) { } define amdgpu_ps void @v_add_u64(i64 %a, ptr addrspace(1) %out) { -; GCN-SDAG-LABEL: v_add_u64: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xf12345678) -; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GCN-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off -; GCN-SDAG-NEXT: s_endpgm -; -; GCN-GISEL-LABEL: v_add_u64: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: v_mov_b64_e32 v[4:5], lit64(0xf12345678) -; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5] -; GCN-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off -; GCN-GISEL-NEXT: s_endpgm +; GCN-LABEL: v_add_u64: +; GCN: ; %bb.0: +; GCN-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0xf12345678), v[0:1] +; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-NEXT: s_endpgm %result = add i64 %a, 64729929336 store i64 %result, ptr addrspace(1) %out, align 8 ret void @@ -42,21 +32,11 @@ define amdgpu_ps i64 @s_add_neg_u64(i64 inreg %a) { } define amdgpu_ps void @v_add_neg_u64(i64 %a, ptr addrspace(1) %out) { -; GCN-SDAG-LABEL: v_add_neg_u64: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xfffffff0edcba988) -; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GCN-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off -; GCN-SDAG-NEXT: s_endpgm -; -; GCN-GISEL-LABEL: v_add_neg_u64: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: v_mov_b64_e32 v[4:5], lit64(0xfffffff0edcba988) -; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5] -; GCN-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off -; GCN-GISEL-NEXT: s_endpgm +; GCN-LABEL: v_add_neg_u64: +; GCN: ; %bb.0: +; GCN-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0xfffffff0edcba988), v[0:1] +; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-NEXT: s_endpgm %result = sub i64 %a, 64729929336 store i64 %result, ptr addrspace(1) %out, align 8 ret void @@ -74,9 +54,7 @@ define amdgpu_ps i64 @s_sub_u64(i64 inreg %a) { define amdgpu_ps void @v_sub_u64(i64 %a, ptr addrspace(1) %out) { ; GCN-LABEL: v_sub_u64: ; GCN: ; %bb.0: -; GCN-NEXT: v_sub_co_u32 v0, vcc_lo, 0x12345678, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_sub_co_ci_u32_e64 v1, null, 15, v1, vcc_lo +; GCN-NEXT: v_sub_nc_u64_e32 v[0:1], lit64(0xf12345678), v[0:1] ; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off ; GCN-NEXT: s_endpgm %result = sub i64 64729929336, %a diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 91b3a85d36114..8d3716ef62f7c 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -5,6 +5,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1250 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s ; mul24 and mad24 are affected @@ -124,6 +125,25 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: test_mul_v2i32: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s2 +; GFX1250-NEXT: s_mov_b32 s9, s3 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: test_mul_v2i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -286,6 +306,29 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: v_mul_v4i32: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s2 +; GFX1250-NEXT: s_mov_b32 s9, s3 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null +; GFX1250-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mul_lo_u32 v3, v3, v7 +; GFX1250-NEXT: v_mul_lo_u32 v2, v2, v6 +; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v5 +; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: v_mul_v4i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] @@ -402,6 +445,19 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: s_trunc_i64_mul_to_i32: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x34 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mul_i32 s2, s3, s2 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: s_trunc_i64_mul_to_i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -555,6 +611,29 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: v_trunc_i64_mul_to_i32: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_mov_b32 s10, -1 +; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s14, s10 +; GFX1250-NEXT: s_mov_b32 s15, s11 +; GFX1250-NEXT: s_mov_b32 s6, s10 +; GFX1250-NEXT: s_mov_b32 s7, s11 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s12, s2 +; GFX1250-NEXT: s_mov_b32 s13, s3 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null +; GFX1250-NEXT: buffer_load_b32 v1, off, s[4:7], null +; GFX1250-NEXT: s_mov_b32 s8, s0 +; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: v_trunc_i64_mul_to_i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] @@ -670,6 +749,19 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: mul64_sext_c: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_ashr_i32 s3, s2, 31 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: mul64_sext_c: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -773,6 +865,18 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: mul64_zext_c: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: mul64_zext_c: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -909,6 +1013,26 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: v_mul64_sext_c: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s2 +; GFX1250-NEXT: s_mov_b32 s9, s3 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mul_u64_e32 v[0:1], 0x50, v[0:1] +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: v_mul64_sext_c: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1052,6 +1176,25 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: v_mul64_zext_c: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s2 +; GFX1250-NEXT: s_mov_b32 s9, s3 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mul_u64_e32 v[0:1], 0x50, v[0:1] +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: v_mul64_zext_c: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1192,6 +1335,26 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: v_mul64_sext_inline_imm: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s2 +; GFX1250-NEXT: s_mov_b32 s9, s3 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mul_u64_e32 v[0:1], 9, v[0:1] +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: v_mul64_sext_inline_imm: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1300,6 +1463,20 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: s_mul_i32: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_clause 0x2 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4c +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x70 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mul_i32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: s_mul_i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -1425,6 +1602,24 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: v_mul_i32: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s2 +; GFX1250-NEXT: s_mov_b32 s9, s3 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: v_mul_i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1540,6 +1735,22 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: s_mul_i1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_clause 0x2 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4c +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x70 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_and_b32 s2, s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_and_b32 s2, s2, 1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: buffer_store_b8 v0, off, s[0:3], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: s_mul_i1: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @10, KC0[], KC1[] @@ -1699,6 +1910,28 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: v_mul_i1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s8, s2 +; GFX1250-NEXT: s_mov_b32 s9, s3 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: buffer_load_u8 v0, off, s[8:11], null +; GFX1250-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: buffer_store_b8 v0, off, s[4:7], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: v_mul_i1: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] @@ -1856,6 +2089,19 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: s_mul_i64: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5] +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: s_mul_i64: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] @@ -2044,6 +2290,29 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: v_mul_i64: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_mov_b32 s10, -1 +; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s14, s10 +; GFX1250-NEXT: s_mov_b32 s15, s11 +; GFX1250-NEXT: s_mov_b32 s6, s10 +; GFX1250-NEXT: s_mov_b32 s7, s11 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s12, s2 +; GFX1250-NEXT: s_mov_b32 s13, s3 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null +; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[4:7], null +; GFX1250-NEXT: s_mov_b32 s8, s0 +; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: v_mul_i64: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] @@ -2286,6 +2555,41 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: mul32_in_branch: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX1250-NEXT: s_mov_b32 s6, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-NEXT: s_cbranch_scc0 .LBB15_2 +; GFX1250-NEXT: ; %bb.1: ; %else +; GFX1250-NEXT: s_mul_i32 s7, s0, s1 +; GFX1250-NEXT: s_branch .LBB15_3 +; GFX1250-NEXT: .LBB15_2: +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: ; implicit-def: $sgpr7 +; GFX1250-NEXT: .LBB15_3: ; %Flow +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX1250-NEXT: s_cbranch_vccnz .LBB15_5 +; GFX1250-NEXT: ; %bb.4: ; %if +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, s2 +; GFX1250-NEXT: s_mov_b32 s5, s3 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null +; GFX1250-NEXT: s_branch .LBB15_6 +; GFX1250-NEXT: .LBB15_5: +; GFX1250-NEXT: v_mov_b32_e32 v0, s7 +; GFX1250-NEXT: .LBB15_6: ; %endif +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: mul32_in_branch: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[] @@ -2539,6 +2843,34 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: mul64_in_branch: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1250-NEXT: s_cbranch_scc0 .LBB16_3 +; GFX1250-NEXT: ; %bb.1: ; %else +; GFX1250-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7] +; GFX1250-NEXT: s_cbranch_execnz .LBB16_4 +; GFX1250-NEXT: .LBB16_2: ; %if +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s4, s2 +; GFX1250-NEXT: s_mov_b32 s5, s3 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null +; GFX1250-NEXT: s_branch .LBB16_5 +; GFX1250-NEXT: .LBB16_3: +; GFX1250-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX1250-NEXT: s_branch .LBB16_2 +; GFX1250-NEXT: .LBB16_4: +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX1250-NEXT: .LBB16_5: ; %endif +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: mul64_in_branch: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[] @@ -2882,6 +3214,52 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: s_mul_i128: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_clause 0x2 +; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x7c +; GFX1250-NEXT: s_load_b128 s[12:15], s[4:5], 0x4c +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b64 s[4:5], lit64(0xffffffff) +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s17, s3 +; GFX1250-NEXT: s_mov_b32 s19, s3 +; GFX1250-NEXT: s_mov_b32 s20, s3 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, s8 +; GFX1250-NEXT: s_and_b64 s[4:5], s[12:13], s[4:5] +; GFX1250-NEXT: s_mov_b32 s6, s13 +; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[12:13] +; GFX1250-NEXT: s_mul_u64 s[12:13], s[4:5], s[2:3] +; GFX1250-NEXT: s_mov_b32 s16, s9 +; GFX1250-NEXT: s_mul_u64 s[8:9], s[8:9], s[14:15] +; GFX1250-NEXT: s_mul_u64 s[14:15], s[6:7], s[2:3] +; GFX1250-NEXT: s_mov_b32 s2, s13 +; GFX1250-NEXT: s_mul_u64 s[4:5], s[4:5], s[16:17] +; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[14:15], s[2:3] +; GFX1250-NEXT: s_mul_u64 s[6:7], s[6:7], s[16:17] +; GFX1250-NEXT: s_mov_b32 s2, s15 +; GFX1250-NEXT: s_mov_b32 s15, s3 +; GFX1250-NEXT: s_mov_b32 s13, s3 +; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[14:15] +; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[10:11], s[8:9] +; GFX1250-NEXT: s_mov_b32 s18, s5 +; GFX1250-NEXT: s_mov_b32 s21, s4 +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19] +; GFX1250-NEXT: s_or_b64 s[4:5], s[12:13], s[20:21] +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[6:7], s[2:3] +; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[8:9] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: s_mul_i128: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 41, @4, KC0[CB0:0-32], KC1[] @@ -3159,6 +3537,43 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX12-NEXT: global_store_b128 v13, v[8:11], s[2:3] ; GFX12-NEXT: s_endpgm ; +; GFX1250-LABEL: v_mul_i128: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX1250-NEXT: v_and_b32_e32 v16, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_load_b128 v[0:3], v16, s[2:3] scale_offset +; GFX1250-NEXT: global_load_b128 v[4:7], v16, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v10, v0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v8, v4 +; GFX1250-NEXT: v_mul_u64_e32 v[6:7], v[0:1], v[6:7] +; GFX1250-NEXT: v_mul_lo_u32 v3, v3, v4 +; GFX1250-NEXT: v_mul_u64_e32 v[8:9], v[8:9], v[10:11] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], null, v2, v4, v[6:7] +; GFX1250-NEXT: v_mul_lo_u32 v2, v2, v5 +; GFX1250-NEXT: v_mov_b32_e32 v10, v9 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], null, v5, v0, v[10:11] +; GFX1250-NEXT: v_add3_u32 v7, v3, v7, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v10, v13 :: v_dual_mov_b32 v13, v11 +; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], null, v4, v1, v[12:13] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v9, v12 +; GFX1250-NEXT: v_mov_b32_e32 v14, v13 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[14:15] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v1, v[10:11] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_add_nc_u64_e32 v[10:11], v[0:1], v[6:7] +; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[2:3] scale_offset +; GFX1250-NEXT: s_endpgm +; ; EG-LABEL: v_mul_i128: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] @@ -3271,6 +3686,13 @@ define i32 @mul_pow2_plus_1(i32 %val) { ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 3, v0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; +; GFX1250-LABEL: mul_pow2_plus_1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 3, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; ; EG-LABEL: mul_pow2_plus_1: ; EG: ; %bb.0: ; EG-NEXT: CF_END diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll index 64392a15e9a9b..192dce369b0ef 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -369,7 +369,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; SDAG-NEXT: s_wait_loadcnt 0x0 -; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 +; SDAG-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; SDAG-NEXT: s_wait_xcnt 0x0 ; SDAG-NEXT: s_wait_alu 0xfffe @@ -418,7 +418,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GISEL-NEXT: s_wait_loadcnt 0x0 -; GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 +; GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off ; GISEL-NEXT: s_wait_xcnt 0x0 ; GISEL-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/sub_u64.ll b/llvm/test/CodeGen/AMDGPU/sub_u64.ll new file mode 100644 index 0000000000000..baaca4ddeaf05 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sub_u64.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s + +define amdgpu_ps <2 x float> @test_sub_u64_vv(i64 %a, i64 %b) { +; GFX12-LABEL: test_sub_u64_vv: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_sub_u64_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: ; return to shader part epilog + %sub = sub i64 %a, %b + %ret = bitcast i64 %sub to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_sub_u64_vs(i64 %a, i64 inreg %b) { +; GFX12-LABEL: test_sub_u64_vs: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_subrev_co_ci_u32_e64 v1, null, s1, v1, vcc_lo +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_sub_u64_vs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_sub_nc_u64_e64 v[0:1], v[0:1], s[0:1] +; GFX1250-NEXT: ; return to shader part epilog + %sub = sub i64 %a, %b + %ret = bitcast i64 %sub to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_sub_u64_sv(i64 inreg %a, i64 %b) { +; GFX12-LABEL: test_sub_u64_sv: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, s1, v1, vcc_lo +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_sub_u64_sv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], s[0:1], v[0:1] +; GFX1250-NEXT: ; return to shader part epilog + %sub = sub i64 %a, %b + %ret = bitcast i64 %sub to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_sub_u64_ss(i64 inreg %a, i64 inreg %b) { +; GCN-LABEL: test_sub_u64_ss: +; GCN: ; %bb.0: +; GCN-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: ; return to shader part epilog + %sub = sub i64 %a, %b + %ret = bitcast i64 %sub to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_sub_u64_inline_lit_v(i64 %a) { +; GFX12-LABEL: test_sub_u64_inline_lit_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, 5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_sub_u64_inline_lit_v: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], 5, v[0:1] +; GFX1250-NEXT: ; return to shader part epilog + %sub = sub i64 5, %a + %ret = bitcast i64 %sub to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_sub_u64_v_inline_lit(i64 %a) { +; GFX12-LABEL: test_sub_u64_v_inline_lit: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, -5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_sub_u64_v_inline_lit: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], -5, v[0:1] +; GFX1250-NEXT: ; return to shader part epilog + %sub = sub i64 %a, 5 + %ret = bitcast i64 %sub to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_sub_u64_small_imm_v(i64 %a) { +; GFX12-LABEL: test_sub_u64_small_imm_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, 0x1f4, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_sub_u64_small_imm_v: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], 0x1f4, v[0:1] +; GFX1250-NEXT: ; return to shader part epilog + %sub = sub i64 500, %a + %ret = bitcast i64 %sub to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_sub_u64_64bit_imm_v(i64 %a) { +; GFX12-LABEL: test_sub_u64_64bit_imm_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, 0x3b9ac9ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, 1, v1, vcc_lo +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_sub_u64_64bit_imm_v: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], lit64(0x13b9ac9ff), v[0:1] +; GFX1250-NEXT: ; return to shader part epilog + %sub = sub i64 5294967295, %a + %ret = bitcast i64 %sub to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @test_sub_u64_small_imm_s(i64 inreg %a) { +; GCN-LABEL: test_sub_u64_small_imm_s: +; GCN: ; %bb.0: +; GCN-NEXT: s_sub_nc_u64 s[0:1], 0x1f4, s[0:1] +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: ; return to shader part epilog + %sub = sub i64 500, %a + %ret = bitcast i64 %sub to <2 x float> + ret <2 x float> %ret +} diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s index 20bc578605b8c..0a1d3bfc02503 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s @@ -154,6 +154,362 @@ v_fmac_f64 v[4:5], v[2:3], v[8:9] div:2 // GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x18] // GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_add_nc_u64 v[4:5], v[2:3], v[4:5] +// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[254:255], v[2:3], v[4:5] +// GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x51] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64_e64 v[4:5], s[2:3], s[4:5] +// GFX1250: v_add_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x08,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], v[254:255], v[4:5] +// GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], vcc, v[4:5] +// GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], exec, v[4:5] +// GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], 0, v[4:5] +// GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], -1, v[4:5] +// GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], 0.5, v[4:5] +// GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], -4.0, v[4:5] +// GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], 0xaf123456, v[4:5] +// GFX1250: v_add_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], 0x3f717273, v[4:5] +// GFX1250: v_add_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], v[2:3], v[254:255] +// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], v[2:3], v[8:9] +// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[254:255], v[2:3], v[8:9] +// GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x51] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], v[254:255], v[8:9] +// GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], vcc, v[8:9] +// GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], exec, v[8:9] +// GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], 0, v[8:9] +// GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], -1, v[8:9] +// GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], 0.5, v[8:9] +// GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], -4.0, v[8:9] +// GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], v[2:3], v[254:255] +// GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x50] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], v[2:3], vcc +// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xd5,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], v[2:3], exec +// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xfd,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], v[2:3], 0 +// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x01,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], v[2:3], -1 +// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x83,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], v[2:3], 0.5 +// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xe1,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], v[2:3], -4.0 +// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xef,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_add_nc_u64 v[4:5], v[2:3], v[8:9] clamp +// GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x28,0xd5,0x02,0x11,0x02,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], v[2:3], v[4:5] +// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[254:255], v[2:3], v[4:5] +// GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x53] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64_e64 v[4:5], s[2:3], s[4:5] +// GFX1250: v_sub_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x08,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], v[254:255], v[4:5] +// GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], vcc, v[4:5] +// GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], exec, v[4:5] +// GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], 0, v[4:5] +// GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], -1, v[4:5] +// GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], 0.5, v[4:5] +// GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], -4.0, v[4:5] +// GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], 0xaf123456, v[4:5] +// GFX1250: v_sub_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], 0x3f717273, v[4:5] +// GFX1250: v_sub_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], v[2:3], v[254:255] +// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], v[2:3], v[8:9] +// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[254:255], v[2:3], v[8:9] +// GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x53] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], v[254:255], v[8:9] +// GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], vcc, v[8:9] +// GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], exec, v[8:9] +// GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], 0, v[8:9] +// GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], -1, v[8:9] +// GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], 0.5, v[8:9] +// GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], -4.0, v[8:9] +// GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], v[2:3], v[254:255] +// GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x52] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], v[2:3], vcc +// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xd5,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], v[2:3], exec +// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xfd,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], v[2:3], 0 +// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x01,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], v[2:3], -1 +// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x83,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], v[2:3], 0.5 +// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xe1,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], v[2:3], -4.0 +// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xef,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sub_nc_u64 v[4:5], v[2:3], v[8:9] clamp +// GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x29,0xd5,0x02,0x11,0x02,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], v[2:3], v[4:5] +// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[254:255], v[2:3], v[4:5] +// GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x55] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64_e64 v[4:5], s[2:3], s[4:5] +// GFX1250: v_mul_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x08,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], v[254:255], v[4:5] +// GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], vcc, v[4:5] +// GFX1250: v_mul_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], exec, v[4:5] +// GFX1250: v_mul_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], 0, v[4:5] +// GFX1250: v_mul_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], -1, v[4:5] +// GFX1250: v_mul_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], 0.5, v[4:5] +// GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], -4.0, v[4:5] +// GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], 0xaf123456, v[4:5] +// GFX1250: v_mul_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], 0x3f717273, v[4:5] +// GFX1250: v_mul_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], v[2:3], v[254:255] +// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], v[2:3], v[8:9] +// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[254:255], v[2:3], v[8:9] +// GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x55] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], v[254:255], v[8:9] +// GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], vcc, v[8:9] +// GFX1250: v_mul_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], exec, v[8:9] +// GFX1250: v_mul_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], 0, v[8:9] +// GFX1250: v_mul_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], -1, v[8:9] +// GFX1250: v_mul_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], 0.5, v[8:9] +// GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], -4.0, v[8:9] +// GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], v[2:3], v[254:255] +// GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x54] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], v[2:3], vcc +// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xd5,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], v[2:3], exec +// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xfd,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], v[2:3], 0 +// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x01,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], v[2:3], -1 +// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x83,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], v[2:3], 0.5 +// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xe1,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_mul_u64 v[4:5], v[2:3], -4.0 +// GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xef,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_fmamk_f64 v[6:7], v[4:5], 0x405ec000, v[2:3] // GFX1250: v_fmamk_f64 v[6:7], v[4:5], 0x405ec000, v[2:3] ; encoding: [0x04,0x05,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] // GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s index f67ad88b5ae83..9f5036106dbd3 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2_err.s @@ -22,3 +22,8 @@ v_fmamk_f16 v4, v2, 3, v6 row_share:1 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. // GFX1250-ERR-NEXT:{{^}}v_fmamk_f16 v4, v2, 3, v6 row_share:1 // GFX1250-ERR-NEXT:{{^}} ^ + +v_mul_u64 v[4:5], v[2:3], v[8:9] clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_mul_u64 v[4:5], v[2:3], v[8:9] clamp +// GFX1250-ERR-NEXT:{{^}} ^ diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt index c1213f2d9ec0d..130941c8c1397 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt @@ -112,6 +112,264 @@ 0x04,0x03,0x17,0xd5,0x02,0x11,0x02,0x00 # GFX1250: v_fmac_f64_e64 v[4:5], |v[2:3]|, |v[8:9]| ; encoding: [0x04,0x03,0x17,0xd5,0x02,0x11,0x02,0x00] +0x02,0x09,0xfc,0x51 +# GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x51] + +0x02,0x11,0xfc,0x51 +# GFX1250: v_add_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x51] + +0xc1,0x08,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x50] + +0xc1,0x10,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x50] + +0xf7,0x08,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x50] + +0xf7,0x10,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x50] + +0x80,0x08,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x50] + +0x80,0x10,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x50] + +0xf0,0x08,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x50] + +0xf0,0x10,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x50] + +0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f +# GFX1250: v_add_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f] + +0xff,0x08,0x08,0x50,0x56,0x34,0x12,0xaf +# GFX1250: v_add_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] + +0x7e,0x08,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x50] + +0x7e,0x10,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x50] + +0xfe,0x09,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x50] + +0xfe,0x11,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x50] + +0x02,0xfd,0x09,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x50] + +0x02,0x09,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x50] + +0x02,0x11,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x50] + +0x6a,0x08,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x50] + +0x6a,0x10,0x08,0x50 +# GFX1250: v_add_nc_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x50] + +0x04,0x00,0x28,0xd5,0x02,0x08,0x00,0x00 +# GFX1250: v_add_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x08,0x00,0x00] + +0x04,0x00,0x28,0xd5,0x02,0x83,0x01,0x00 +# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x83,0x01,0x00] + +0x04,0x00,0x28,0xd5,0x02,0xef,0x01,0x00 +# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xef,0x01,0x00] + +0x04,0x00,0x28,0xd5,0x02,0x01,0x01,0x00 +# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0x01,0x01,0x00] + +0x04,0x00,0x28,0xd5,0x02,0xe1,0x01,0x00 +# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xe1,0x01,0x00] + +0x04,0x00,0x28,0xd5,0x02,0xfd,0x00,0x00 +# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xfd,0x00,0x00] + +0x04,0x80,0x28,0xd5,0x02,0x11,0x02,0x00 +# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x28,0xd5,0x02,0x11,0x02,0x00] + +0x04,0x00,0x28,0xd5,0x02,0xd5,0x00,0x00 +# GFX1250: v_add_nc_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x28,0xd5,0x02,0xd5,0x00,0x00] + +0x02,0x09,0xfc,0x53 +# GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x53] + +0x02,0x11,0xfc,0x53 +# GFX1250: v_sub_nc_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x53] + +0xc1,0x08,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x52] + +0xc1,0x10,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x52] + +0xf7,0x08,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x52] + +0xf7,0x10,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x52] + +0x80,0x08,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x52] + +0x80,0x10,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x52] + +0xf0,0x08,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x52] + +0xf0,0x10,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x52] + +0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f +# GFX1250: v_sub_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f] + +0xff,0x08,0x08,0x52,0x56,0x34,0x12,0xaf +# GFX1250: v_sub_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] + +0x7e,0x08,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x52] + +0x7e,0x10,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x52] + +0xfe,0x09,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x52] + +0xfe,0x11,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x52] + +0x02,0xfd,0x09,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x52] + +0x02,0x09,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x52] + +0x02,0x11,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x52] + +0x6a,0x08,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x52] + +0x6a,0x10,0x08,0x52 +# GFX1250: v_sub_nc_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x52] + +0x04,0x00,0x29,0xd5,0x02,0x08,0x00,0x00 +# GFX1250: v_sub_nc_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x08,0x00,0x00] + +0x04,0x00,0x29,0xd5,0x02,0x83,0x01,0x00 +# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x83,0x01,0x00] + +0x04,0x00,0x29,0xd5,0x02,0xef,0x01,0x00 +# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xef,0x01,0x00] + +0x04,0x00,0x29,0xd5,0x02,0x01,0x01,0x00 +# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0x01,0x01,0x00] + +0x04,0x00,0x29,0xd5,0x02,0xe1,0x01,0x00 +# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xe1,0x01,0x00] + +0x04,0x00,0x29,0xd5,0x02,0xfd,0x00,0x00 +# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xfd,0x00,0x00] + +0x04,0x80,0x29,0xd5,0x02,0x11,0x02,0x00 +# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x29,0xd5,0x02,0x11,0x02,0x00] + +0x04,0x00,0x29,0xd5,0x02,0xd5,0x00,0x00 +# GFX1250: v_sub_nc_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x29,0xd5,0x02,0xd5,0x00,0x00] + +0x02,0x09,0xfc,0x55 +# GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x55] + +0x02,0x11,0xfc,0x55 +# GFX1250: v_mul_u64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x55] + +0xc1,0x08,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x54] + +0xc1,0x10,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x54] + +0xf7,0x08,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x54] + +0xf7,0x10,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x54] + +0x80,0x08,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x54] + +0x80,0x10,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x54] + +0xf0,0x08,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x54] + +0xf0,0x10,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x54] + +0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f +# GFX1250: v_mul_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f] + +0xff,0x08,0x08,0x54,0x56,0x34,0x12,0xaf +# GFX1250: v_mul_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] + +0x7e,0x08,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x54] + +0x7e,0x10,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x54] + +0xfe,0x09,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x54] + +0xfe,0x11,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x54] + +0x02,0xfd,0x09,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x54] + +0x02,0x09,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x54] + +0x02,0x11,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x54] + +0x6a,0x08,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x54] + +0x6a,0x10,0x08,0x54 +# GFX1250: v_mul_u64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x54] + +0x04,0x00,0x2a,0xd5,0x02,0x08,0x00,0x00 +# GFX1250: v_mul_u64_e64 v[4:5], s[2:3], s[4:5] ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x08,0x00,0x00] + +0x04,0x00,0x2a,0xd5,0x02,0x83,0x01,0x00 +# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x83,0x01,0x00] + +0x04,0x00,0x2a,0xd5,0x02,0xef,0x01,0x00 +# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xef,0x01,0x00] + +0x04,0x00,0x2a,0xd5,0x02,0x01,0x01,0x00 +# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0x01,0x01,0x00] + +0x04,0x00,0x2a,0xd5,0x02,0xe1,0x01,0x00 +# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xe1,0x01,0x00] + +0x04,0x00,0x2a,0xd5,0x02,0xfd,0x00,0x00 +# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xfd,0x00,0x00] + +0x04,0x00,0x2a,0xd5,0x02,0xd5,0x00,0x00 +# GFX1250: v_mul_u64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x2a,0xd5,0x02,0xd5,0x00,0x00] + 0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 # GFX1250: v_fmaak_f64 v[254:255], 0x405ec000, v[254:255], 0x405ec000 ; encoding: [0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40]