diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index d8c4cbbc4fa33..c690b2b7129b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2293,16 +2293,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
     assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
             !ST.hasGloballyAddressableScratch()) &&
            "Cannot use src_private_base with globally addressable scratch!");
-    // FIXME: It would be more natural to emit a COPY here, but then copy
-    // coalescing would kick in and it would think it's okay to use the "HI"
-    // subregister (instead of extracting the HI 32 bits) which is an artificial
-    // (unusable) register.
-    // Register TableGen definitions would need an overhaul to get rid of the
-    // artificial "HI" aperture registers and prevent this kind of issue from
-    // happening.
     Register Dst = MRI.createGenericVirtualRegister(S64);
     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
-    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
+    B.buildCopy({Dst}, {Register(ApertureRegNo)});
     return B.buildUnmerge(S32, Dst).getReg(1);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6a4df5eeb9779..1982295dde909 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8159,25 +8159,14 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
     // it returns a wrong value (all zeroes?). The real value is in the upper 32
     // bits.
     //
-    // To work around the issue, directly emit a 64 bit mov from this register
+    // To work around the issue, emit a 64 bit copy from this register
     // then extract the high bits. Note that this shouldn't even result in a
     // shift being emitted and simply become a pair of registers (e.g.):
     //    s_mov_b64 s[6:7], src_shared_base
     //    v_mov_b32_e32 v1, s7
-    //
-    // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
-    // coalescing would kick in and it would think it's okay to use the "HI"
-    // subregister directly (instead of extracting the HI 32 bits) which is an
-    // artificial (unusable) register.
-    // Register TableGen definitions would need an overhaul to get rid of the
-    // artificial "HI" aperture registers and prevent this kind of issue from
-    // happening.
-    SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
-                                     DAG.getRegister(ApertureRegNo, MVT::i64));
-    return DAG.getNode(
-        ISD::TRUNCATE, DL, MVT::i32,
-        DAG.getNode(ISD::SRL, DL, MVT::i64,
-                    {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
+    SDValue Copy =
+        DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
+    return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
   }
 
   // For code object version 5, private_base and shared_base are passed through
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5106478a95b43..7b77ca22cddc5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -913,7 +913,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       return;
     }
 
-    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+    if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
      return;
    }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 31dd6b9e8d84d..7eccaafefc893 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -228,16 +228,12 @@ def SGPR_NULL64 :
 // need them, we need to do a 64 bit load and extract the bits manually.
 multiclass ApertureRegister<string name, bits<16> regIdx> {
   let isConstant = true in {
-    // FIXME: We shouldn't need to define subregisters for these (nor add them to any 16 bit
-    // register classes), but if we don't it seems to confuse the TableGen
-    // backend and we end up with a lot of weird register pressure sets and classes.
     defm _LO : SIRegLoHi16 <name, regIdx>;
-    defm _HI : SIRegLoHi16 <"", regIdx>;
-
-    def "" : RegisterWithSubRegs<name, [!cast<Register>(NAME#_LO), !cast<Register>(NAME#_HI)]> {
+    def "" : RegisterWithSubRegs<name, [!cast<Register>(NAME#_LO)]> {
       let Namespace = "AMDGPU";
-      let SubRegIndices = [sub0, sub1];
+      let SubRegIndices = [sub0];
       let HWEncoding = !cast<Register>(NAME#_LO).HWEncoding;
+      let CoveredBySubRegs = 0;
     }
   } // isConstant = true
 }
@@ -790,8 +786,7 @@ let GeneratePressureSet = 0, HasSGPR = 1 in {
 def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
   (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
    SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO,
-   SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI,
-   SRC_SHARED_LIMIT_HI, SRC_PRIVATE_BASE_HI, SRC_PRIVATE_LIMIT_HI, SRC_POPS_EXITING_WAVE_ID,
+   SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_POPS_EXITING_WAVE_ID,
    SRC_VCCZ, SRC_EXECZ, SRC_SCC, SRC_FLAT_SCRATCH_BASE_LO, SRC_FLAT_SCRATCH_BASE_HI)> {
   let AllocationPriority = 0;
 }
@@ -801,10 +796,9 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
   XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, SGPR_NULL_HI_LO16, TTMP_LO16,
   TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO_LO16,
   SRC_SHARED_LIMIT_LO_LO16, SRC_PRIVATE_BASE_LO_LO16, SRC_PRIVATE_LIMIT_LO_LO16,
-  SRC_SHARED_BASE_HI_LO16, SRC_SHARED_LIMIT_HI_LO16, SRC_PRIVATE_BASE_HI_LO16,
-  SRC_PRIVATE_LIMIT_HI_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16,
-  SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16,
-  SRC_FLAT_SCRATCH_BASE_LO_LO16, SRC_FLAT_SCRATCH_BASE_HI_LO16)> {
+  SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16,
+  EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16, SRC_FLAT_SCRATCH_BASE_LO_LO16,
+  SRC_FLAT_SCRATCH_BASE_HI_LO16)> {
   let Size = 16;
   let isAllocatable = 0;
   let BaseClassOrder = 16;
@@ -825,6 +819,13 @@ def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2
   let AllocationPriority = 0;
 }
 
+def APERTURE_Class : SIRegisterClass<"AMDGPU", Reg64Types.types, 32,
+  (add SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> {
+  let isAllocatable = 0;
+  let Size = 64;
+  let BaseClassOrder = 10000;
+}
+
 } // End GeneratePressureSet = 0
 
 // Register class for all scalar registers (SGPRs + Special Registers)
@@ -876,8 +877,7 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16],
 }
 
 def SReg_64_XEXEC_XNULL : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
-  (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SRC_SHARED_BASE,
-   SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA,
+  (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA,
    SRC_FLAT_SCRATCH_BASE)> {
   let CopyCost = 1;
   let AllocationPriority = 1;
@@ -900,6 +900,14 @@ def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f1
   let Size = 64;
 }
 
+def SReg_64_Encodable : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
+  (add SReg_64, APERTURE_Class)> {
+  let CopyCost = 1;
+  let isAllocatable = 0;
+  let HasSGPR = 1;
+  let Size = 64;
+}
+
 def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32,
   (add SReg_64_XEXEC, SReg_32_XEXEC)> {
   let CopyCost = 1;
@@ -1225,7 +1233,7 @@ def SSrc_bf16: SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_BF16">;
 def SSrc_f16 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP16">;
 def SSrc_b32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_INT32">;
 def SSrc_f32 : SrcRegOrImm9 <SReg_32, "OPERAND_REG_IMM_FP32">;
-def SSrc_b64 : SrcRegOrImm9 <SReg_64, "OPERAND_REG_IMM_INT64">;
+def SSrc_b64 : SrcRegOrImm9 <SReg_64_Encodable, "OPERAND_REG_IMM_INT64">;
 
 def SSrcOrLds_b32 : SrcRegOrImm9 <SRegOrLds_32, "OPERAND_REG_IMM_INT32">;
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
index b520ce1826ec9..3d224f2f6bf05 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
@@ -9,12 +9,11 @@ define amdgpu_ps void @amdgpu_ps() {
 ; MESA-LABEL: amdgpu_ps:
 ; MESA:       ; %bb.0:
-; MESA-NEXT:    s_add_u32 flat_scratch_lo, s2, s4
-; MESA-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; MESA-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; MESA-NEXT:    s_mov_b32 s0, 0
-; MESA-NEXT:    s_mov_b64 s[2:3], src_private_base
-; MESA-NEXT:    s_mov_b32 s1, s3
+; MESA-NEXT:    s_add_u32 flat_scratch_lo, s2, s4
 ; MESA-NEXT:    v_mov_b32_e32 v0, s0
+; MESA-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; MESA-NEXT:    v_mov_b32_e32 v2, 0
 ; MESA-NEXT:    v_mov_b32_e32 v1, s1
 ; MESA-NEXT:    flat_store_dword v[0:1], v2
@@ -30,11 +29,10 @@ define amdgpu_ps void @amdgpu_ps() {
 ; PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
-; PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; PAL-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; PAL-NEXT:    s_mov_b32 s0, 0
-; PAL-NEXT:    s_mov_b64 s[2:3], src_private_base
-; PAL-NEXT:    s_mov_b32 s1, s3
 ; PAL-NEXT:    v_mov_b32_e32 v0, s0
+; PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; PAL-NEXT:    flat_store_dword v[0:1], v2
 ; PAL-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 86766e2904619..9539ec465e02f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++
b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -65,52 +65,52 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9V4-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_mov_b32 s2, s0 -; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V4-NEXT: s_mov_b32 s0, s4 +; GFX9V4-NEXT: s_cmp_lg_u32 s4, -1 +; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX9V4-NEXT: s_mov_b32 s2, s5 +; GFX9V4-NEXT: s_cmp_lg_u32 s5, -1 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9V4-NEXT: s_mov_b32 s4, s1 -; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 -; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 ; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 -; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s1 ; GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) -; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V4-NEXT: v_mov_b32_e32 v2, 2 -; GFX9V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) ; GFX9V4-NEXT: s_endpgm ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V5-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9V5-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_mov_b32 s2, s0 -; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V5-NEXT: s_mov_b32 s0, s4 +; GFX9V5-NEXT: s_cmp_lg_u32 s4, -1 +; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX9V5-NEXT: s_mov_b32 s2, s5 +; GFX9V5-NEXT: s_cmp_lg_u32 s5, -1 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9V5-NEXT: s_mov_b32 s4, s1 -; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 -; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 ; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 -; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s1 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) -; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V5-NEXT: v_mov_b32_e32 v2, 2 -; GFX9V5-NEXT: v_mov_b32_e32 v1, s1 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: s_endpgm @@ -150,10 +150,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V4-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX9V4-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9V4-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 +; GFX9V4-NEXT: s_cmp_eq_u32 s3, s1 ; 
GFX9V4-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -162,10 +162,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V5-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX9V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9V5-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 +; GFX9V5-NEXT: s_cmp_eq_u32 s3, s1 ; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off @@ -206,10 +206,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9V4-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3 +; GFX9V4-NEXT: s_cmp_eq_u32 s3, s1 ; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -218,10 +218,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3 +; GFX9V5-NEXT: s_cmp_eq_u32 s3, s1 ; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir index d69a3e1a15bbd..4471980c1ba1c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir @@ -158,8 +158,8 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5) - ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s64) = COPY $src_private_base + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 @@ -227,8 +227,8 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3) - ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s64) = COPY $src_shared_base + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 ; GFX9-NEXT: 
[[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 @@ -380,16 +380,16 @@ body: | ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>) ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) - ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s64) = COPY $src_shared_base + ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV3]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]] ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]] ; GFX9-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) - ; GFX9-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base - ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_1]](s64) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s64) = COPY $src_shared_base + ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[UV5]](s32) ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]] ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C1]] @@ -517,8 +517,8 @@ body: | ; GFX9-LABEL: name: test_addrspacecast_p5_fi_to_p0 ; GFX9: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5) - ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s64) = COPY $src_private_base + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p0) %0:_(p5) = G_FRAME_INDEX %stack.0 diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll index 58f3ffb0492e0..bc341f2baa804 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll @@ -361,8 +361,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__av(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] @@ -417,8 +417,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__v(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def 
v[2:3] @@ -473,8 +473,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__a(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] @@ -538,13 +538,13 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__a(ptr %ptr) #0 { ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 @@ -603,13 +603,13 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__v(ptr %ptr) #0 { ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -659,12 +659,12 @@ define void @flat_atomic_cmpxchg_i64_ret_v_a__v(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART @@ -717,12 +717,12 @@ define void @flat_atomic_cmpxchg_i64_ret_a_v__v(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART @@ -775,8 +775,8 @@ define void @flat_atomic_cmpxchg_i64_ret_v_v__a(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] @@ -836,8 +836,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_v__av(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v7, 
vcc, 0, v1, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] @@ -892,8 +892,8 @@ define void @flat_atomic_cmpxchg_i64_ret_v_av__av(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[2:3] @@ -948,12 +948,12 @@ define void @flat_atomic_cmpxchg_i64_ret_av_a__av(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART @@ -1006,12 +1006,12 @@ define void @flat_atomic_cmpxchg_i64_ret_a_av__av(ptr %ptr) #0 { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; CHECK-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index 6b6eb43baf856..d053425afbb6d 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -641,12 +641,12 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -686,15 +686,15 @@ define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[2:3] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; 
GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -739,12 +739,12 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -782,15 +782,15 @@ define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_v: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -833,8 +833,8 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -876,9 +876,9 @@ define void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_v_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -927,8 +927,8 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -968,9 +968,9 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[4:5] @@ -1017,8 +1017,8 @@ 
define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -1058,9 +1058,9 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_v: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[4:5] @@ -1107,8 +1107,8 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -1150,9 +1150,9 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -1201,12 +1201,12 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1244,15 +1244,15 @@ define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -1295,8 +1295,8 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) 
#0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -1336,9 +1336,9 @@ define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i64_ret_v_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[4:5] @@ -1383,11 +1383,11 @@ define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i64_noret_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2487,11 +2487,11 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -2549,13 +2549,13 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -2616,11 +2616,11 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_v: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -2674,13 +2674,13 @@ define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_v: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -3221,11 +3221,11 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -3279,13 +3279,13 @@ define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -3458,11 +3458,11 @@ define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i64_noret_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3516,13 +3516,13 @@ define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_expansion_i64_noret_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB40_3 @@ -4304,11 +4304,11 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 
s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -4350,13 +4350,13 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4403,11 +4403,11 @@ define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_v: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -4445,13 +4445,13 @@ define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_i64_ret_a_v: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4858,11 +4858,11 @@ define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -4900,13 +4900,13 @@ define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_i64_ret_a_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5035,11 +5035,11 @@ define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_i64_noret_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND 
; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5077,13 +5077,13 @@ define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_i64_noret_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB61_3 @@ -6306,12 +6306,12 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -6352,15 +6352,15 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_add_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6404,8 +6404,8 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -6443,9 +6443,9 @@ define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_add_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[4:5] @@ -6489,12 +6489,12 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -6535,15 +6535,15 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_sub_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6589,8 +6589,8 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -6628,9 +6628,9 @@ define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_sub_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -6676,12 +6676,12 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -6722,15 +6722,15 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_and_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: 
v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6775,8 +6775,8 @@ define void @flat_atomic_and_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -6814,9 +6814,9 @@ define void @flat_atomic_and_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_and_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -6861,12 +6861,12 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -6925,15 +6925,15 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_nand_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6996,8 +6996,8 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] @@ -7054,9 +7054,9 @@ define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_nand_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; 
GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:1] @@ -7119,12 +7119,12 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -7165,15 +7165,15 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_or_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7218,8 +7218,8 @@ define void @flat_atomic_or_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -7257,9 +7257,9 @@ define void @flat_atomic_or_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_or_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -7304,12 +7304,12 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -7351,15 +7351,15 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_max_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: 
s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7406,8 +7406,8 @@ define void @flat_atomic_max_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -7446,9 +7446,9 @@ define void @flat_atomic_max_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_max_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -7495,12 +7495,12 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -7542,15 +7542,15 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_min_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7597,8 +7597,8 @@ define void @flat_atomic_min_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 
vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -7637,9 +7637,9 @@ define void @flat_atomic_min_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_min_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -7686,12 +7686,12 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -7733,15 +7733,15 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_umax_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7788,8 +7788,8 @@ define void @flat_atomic_umax_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -7828,9 +7828,9 @@ define void @flat_atomic_umax_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_umax_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -7877,12 +7877,12 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: 
s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -7924,15 +7924,15 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_umin_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7979,8 +7979,8 @@ define void @flat_atomic_umin_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -8019,9 +8019,9 @@ define void @flat_atomic_umin_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_umin_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -8068,12 +8068,12 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -8117,15 +8117,15 @@ define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, 
s[0:1] @@ -8172,8 +8172,8 @@ define void @flat_atomic_uinc_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -8214,9 +8214,9 @@ define void @flat_atomic_uinc_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -8264,12 +8264,12 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -8315,15 +8315,15 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8372,8 +8372,8 @@ define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -8416,9 +8416,9 @@ define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def 
v[2:3] @@ -8467,12 +8467,12 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -8533,15 +8533,15 @@ define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_usub_cond_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8609,8 +8609,8 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] @@ -8669,9 +8669,9 @@ define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_usub_cond_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:1] @@ -8740,12 +8740,12 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -8806,15 +8806,15 @@ define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_usub_sat_i64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; 
GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8882,8 +8882,8 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] @@ -8942,9 +8942,9 @@ define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_usub_sat_i64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:1] @@ -9017,8 +9017,8 @@ define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9100,8 +9100,8 @@ define void @flat_atomic_fadd_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v3 @@ -9818,12 +9818,12 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -9884,15 +9884,15 @@ define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fadd_f64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: 
s_mov_b64 s[0:1], src_shared_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -9958,8 +9958,8 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -10015,9 +10015,9 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fadd_f64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[4:5] @@ -10081,12 +10081,12 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -10139,15 +10139,15 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fsub_f64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -10204,8 +10204,8 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -10255,9 +10255,9 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fsub_f64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 
-; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -10314,12 +10314,12 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -10361,15 +10361,15 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmax_f64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -10415,8 +10415,8 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -10455,9 +10455,9 @@ define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmax_f64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -10503,12 +10503,12 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -10550,15 +10550,15 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmin_f64_ret_a_a: ; 
GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -10604,8 +10604,8 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:5] @@ -10644,9 +10644,9 @@ define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmin_f64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[2:3] @@ -10692,12 +10692,12 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -10758,15 +10758,15 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmaximum_f64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -10832,8 +10832,8 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: 
s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] @@ -10891,9 +10891,9 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmaximum_f64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:1] @@ -10960,12 +10960,12 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -11026,15 +11026,15 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fminimum_f64_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -11100,8 +11100,8 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] @@ -11159,9 +11159,9 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fminimum_f64_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:1] @@ -14340,8 +14340,8 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], 
src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -14383,8 +14383,8 @@ define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -14430,8 +14430,8 @@ define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -14467,8 +14467,8 @@ define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -14508,8 +14508,8 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -14555,8 +14555,8 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -14604,8 +14604,8 @@ define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -14643,8 +14643,8 @@ define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -14685,8 +14685,8 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; 
GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -14732,8 +14732,8 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -14783,8 +14783,8 @@ define void @flat_atomic_sub_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -14822,8 +14822,8 @@ define void @flat_atomic_sub_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -14866,8 +14866,8 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -14913,8 +14913,8 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -14963,8 +14963,8 @@ define void @flat_atomic_and_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -15002,8 +15002,8 @@ define void @flat_atomic_and_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -15045,8 +15045,8 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: 
s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -15109,8 +15109,8 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -15176,8 +15176,8 @@ define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -15234,8 +15234,8 @@ define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -15295,8 +15295,8 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -15342,8 +15342,8 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -15392,8 +15392,8 @@ define void @flat_atomic_or_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -15431,8 +15431,8 @@ define void @flat_atomic_or_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -15474,8 +15474,8 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; 
GFX90A-NEXT: ; def a[0:1] @@ -15521,8 +15521,8 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -15571,8 +15571,8 @@ define void @flat_atomic_xor_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -15610,8 +15610,8 @@ define void @flat_atomic_xor_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -15653,8 +15653,8 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -15701,8 +15701,8 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -15753,8 +15753,8 @@ define void @flat_atomic_max_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -15793,8 +15793,8 @@ define void @flat_atomic_max_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -15838,8 +15838,8 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -15886,8 +15886,8 @@ define 
void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -15938,8 +15938,8 @@ define void @flat_atomic_min_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -15978,8 +15978,8 @@ define void @flat_atomic_min_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -16023,8 +16023,8 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -16071,8 +16071,8 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -16123,8 +16123,8 @@ define void @flat_atomic_umax_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -16163,8 +16163,8 @@ define void @flat_atomic_umax_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -16208,8 +16208,8 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -16256,8 +16256,8 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg 
%ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -16308,8 +16308,8 @@ define void @flat_atomic_umin_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -16348,8 +16348,8 @@ define void @flat_atomic_umin_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -16393,8 +16393,8 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -16443,8 +16443,8 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -16495,8 +16495,8 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -16537,8 +16537,8 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -16583,8 +16583,8 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -16635,8 +16635,8 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: 
; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -16689,8 +16689,8 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -16733,8 +16733,8 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -16780,8 +16780,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -16846,8 +16846,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -16918,8 +16918,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -16978,8 +16978,8 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -17045,8 +17045,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -17111,8 +17111,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; 
GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -17183,8 +17183,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -17243,8 +17243,8 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -17314,8 +17314,8 @@ define void @flat_atomic_fadd_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 40 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -17396,8 +17396,8 @@ define void @flat_atomic_fadd_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 40 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -18141,8 +18141,8 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -18205,8 +18205,8 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -18272,8 +18272,8 @@ define void @flat_atomic_fadd_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -18327,8 +18327,8 @@ define void @flat_atomic_fadd_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -18386,8 +18386,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -18444,8 +18444,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -18505,8 +18505,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -18556,8 +18556,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -18611,8 +18611,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -18659,8 +18659,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -18710,8 +18710,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -18750,8 +18750,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, 
s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -18794,8 +18794,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -18842,8 +18842,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -18893,8 +18893,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -18933,8 +18933,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -18977,8 +18977,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -19043,8 +19043,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -19113,8 +19113,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -19172,8 +19172,8 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: 
s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -19237,8 +19237,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] @@ -19303,8 +19303,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] @@ -19373,8 +19373,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 -; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] @@ -19432,8 +19432,8 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index a6a0a9a3c9015..4df82946343b5 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -8,8 +8,8 @@ target triple = "amdgcn-amd-amdhsa" ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} ; CI-DAG: s_cmp_lg_u32 [[PTR]], -1 -; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0 -; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 +; CI-DAG: s_cselect_b32 s[[SHI:[0-9]+]], [[APERTURE]], 0 +; CI-DAG: s_cselect_b32 s[[SLO:[0-9]+]], [[PTR]], 0 ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base @@ -17,10 +17,13 @@ target triple = "amdgcn-amd-amdhsa" ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} ; GFX9: s_cmp_lg_u32 [[PTR]], -1 -; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0 -; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0 +; GFX9-DAG: s_cselect_b32 s[[SHI:[0-9]+]], s[[HIBASE]], 0 +; GFX9-DAG: s_cselect_b32 s[[SLO:[0-9]+]], [[PTR]], 0 -; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] +; HSA-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] +; HSA-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] + +; HSA: flat_store_dword v[[[VLO]]:[[VHI]]], [[K]] ; HSA: .amdhsa_user_sgpr_private_segment_buffer 1 ; HSA: .amdhsa_user_sgpr_dispatch_ptr 0 @@ -68,18 +71,21 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 { ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; CI-DAG: 
s_cmp_lg_u32 [[PTR]], -1 -; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0 -; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 +; CI-DAG: s_cselect_b32 s[[SHI:[0-9]+]], [[APERTURE]], 0 +; CI-DAG: s_cselect_b32 s[[SLO:[0-9]+]], [[PTR]], 0 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; GFX9: s_cmp_lg_u32 [[PTR]], -1 -; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0 -; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0 +; GFX9: s_cselect_b32 s[[SHI:[0-9]+]], s[[HIBASE]], 0 +; GFX9: s_cselect_b32 s[[SLO:[0-9]+]], [[PTR]], 0 -; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] +; HSA-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] +; HSA-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] + +; HSA: flat_store_dword v[[[VLO]]:[[VHI]]], [[K]] ; HSA: .amdhsa_user_sgpr_private_segment_buffer 1 ; HSA: .amdhsa_user_sgpr_dispatch_ptr 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index e4323999d19c3..3c316f4acedb7 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -456,12 +456,10 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: s_mov_b32 s6, 32 -; GFX908-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX908-NEXT: s_getpc_b64 s[6:7] ; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 -; GFX908-NEXT: s_cmp_eq_u32 s7, s4 +; GFX908-NEXT: s_cmp_eq_u32 s7, s5 ; GFX908-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX908-NEXT: s_mov_b64 s[4:5], -1 @@ -507,12 +505,10 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: s_mov_b32 s6, 32 -; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX90A-NEXT: s_getpc_b64 s[6:7] ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 -; GFX90A-NEXT: s_cmp_eq_u32 s7, s4 +; GFX90A-NEXT: s_cmp_eq_u32 s7, s5 ; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX90A-NEXT: s_mov_b64 s[4:5], -1 @@ -558,12 +554,10 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX942-NEXT: s_mov_b32 s2, 32 -; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX942-NEXT: s_getpc_b64 s[2:3] ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 -; GFX942-NEXT: s_cmp_eq_u32 s3, s0 +; GFX942-NEXT: s_cmp_eq_u32 s3, s1 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], -1 @@ -607,12 +601,10 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1100-NEXT: s_mov_b32 s2, 32 -; GFX1100-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX1100-NEXT: s_getpc_b64 s[2:3] ; GFX1100-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 -; GFX1100-NEXT: s_cmp_eq_u32 s3, s0 +; 
GFX1100-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1100-NEXT: s_mov_b32 s0, -1 @@ -660,9 +652,6 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; GFX1200-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1200-NEXT: s_mov_b32 s2, 32 -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX1200-NEXT: s_getpc_b64 s[2:3] ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_sext_i32_i16 s3, s3 @@ -670,7 +659,7 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1200-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1200-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 @@ -731,12 +720,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX908-NEXT: s_mov_b32 s6, 32 -; GFX908-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX908-NEXT: s_getpc_b64 s[6:7] ; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 -; GFX908-NEXT: s_cmp_eq_u32 s7, s4 +; GFX908-NEXT: s_cmp_eq_u32 s7, s5 ; GFX908-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX908-NEXT: s_mov_b64 s[4:5], -1 @@ -800,12 +787,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base -; GFX90A-NEXT: s_mov_b32 s6, 32 -; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX90A-NEXT: s_getpc_b64 s[6:7] ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 -; GFX90A-NEXT: s_cmp_eq_u32 s7, s4 +; GFX90A-NEXT: s_cmp_eq_u32 s7, s5 ; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX90A-NEXT: s_mov_b64 s[4:5], -1 @@ -825,12 +810,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: s_branch .LBB5_10 ; GFX90A-NEXT: .LBB5_3: ; %atomicrmw.check.private ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: s_mov_b32 s6, 32 -; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX90A-NEXT: s_getpc_b64 s[6:7] ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 -; GFX90A-NEXT: s_cmp_eq_u32 s7, s4 +; GFX90A-NEXT: s_cmp_eq_u32 s7, s5 ; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GFX90A-NEXT: s_mov_b64 s[4:5], -1 @@ -896,12 +879,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX942-NEXT: s_mov_b32 s2, 32 -; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX942-NEXT: s_getpc_b64 s[2:3] ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 -; GFX942-NEXT: s_cmp_eq_u32 s3, s0 +; GFX942-NEXT: s_cmp_eq_u32 s3, s1 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], -1 @@ -921,12 +902,10 @@ define double 
@optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: s_branch .LBB5_10 ; GFX942-NEXT: .LBB5_3: ; %atomicrmw.check.private ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX942-NEXT: s_mov_b32 s2, 32 -; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX942-NEXT: s_getpc_b64 s[2:3] ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 -; GFX942-NEXT: s_cmp_eq_u32 s3, s0 +; GFX942-NEXT: s_cmp_eq_u32 s3, s1 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], -1 @@ -990,12 +969,10 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1100-NEXT: s_mov_b32 s2, 32 -; GFX1100-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX1100-NEXT: s_getpc_b64 s[2:3] ; GFX1100-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 ; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 -; GFX1100-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1100-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX1100-NEXT: s_mov_b32 s0, -1 @@ -1060,9 +1037,6 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; GFX1200-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1200-NEXT: s_mov_b32 s2, 32 -; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX1200-NEXT: s_getpc_b64 s[2:3] ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_sext_i32_i16 s3, s3 @@ -1070,7 +1044,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24 ; GFX1200-NEXT: s_wait_alu 0xfffe -; GFX1200-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1200-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1200-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll index 51caa84450ff3..583b6fe0a81ca 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll @@ -134,57 +134,57 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr ; ; GFX9-LABEL: with_private_to_flat_addrspacecast_cc_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s0, s1, 0 -; GFX9-NEXT: s_cselect_b32 s1, s2, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX9-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel: ; GFX9-ARCH-FLAT: ; %bb.0: -; GFX9-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0 ; 
GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ARCH-FLAT-NEXT: s_cmp_lg_u32 s2, -1 -; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s0, s1, 0 -; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s1, s2, 0 -; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-ARCH-FLAT-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 ; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX9-ARCH-FLAT-NEXT: s_endpgm ; ; GFX942-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel: ; GFX942-ARCH-FLAT: ; %bb.0: -; GFX942-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-ARCH-FLAT-NEXT: s_cmp_lg_u32 s2, -1 -; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s0, s1, 0 -; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s1, s2, 0 -; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-ARCH-FLAT-NEXT: s_cmp_lg_u32 s0, -1 +; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s1, s1, 0 +; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s0, s0, 0 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 ; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX942-ARCH-FLAT-NEXT: s_endpgm ; ; GFX10-LABEL: with_private_to_flat_addrspacecast_cc_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s2, -1 -; GFX10-NEXT: s_cselect_b32 s0, s2, 0 +; GFX10-NEXT: s_cmp_lg_u32 s0, -1 +; GFX10-NEXT: s_cselect_b32 s0, s0, 0 ; GFX10-NEXT: s_cselect_b32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -533,49 +533,49 @@ define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) noca ; ; GFX9-LABEL: private_constant_expression_use: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX9-ARCH-FLAT-LABEL: private_constant_expression_use: ; GFX9-ARCH-FLAT: ; %bb.0: -; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ARCH-FLAT-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-ARCH-FLAT-NEXT: global_store_dwordx2 v2, 
v[0:1], s[2:3] ; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX9-ARCH-FLAT-NEXT: s_endpgm ; ; GFX942-ARCH-FLAT-LABEL: private_constant_expression_use: ; GFX942-ARCH-FLAT: ; %bb.0: -; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s3 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-ARCH-FLAT-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX942-ARCH-FLAT-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX942-ARCH-FLAT-NEXT: s_endpgm ; ; GFX10-LABEL: private_constant_expression_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8 @@ -611,48 +611,48 @@ define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) ; ; GFX9-LABEL: calls_intrin_ascast_cc_kernel: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, 7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX9-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel: ; GFX9-ARCH-FLAT: ; %bb.0: -; GFX9-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 ; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 ; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX9-ARCH-FLAT-NEXT: s_endpgm ; ; GFX942-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel: ; GFX942-ARCH-FLAT: ; %bb.0: -; GFX942-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 ; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 ; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX942-ARCH-FLAT-NEXT: s_endpgm ; ; GFX10-LABEL: calls_intrin_ascast_cc_kernel: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX10-NEXT: s_mov_b64 
s[0:1], src_shared_base -; GFX10-NEXT: v_mov_b32_e32 v2, 7 +; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, 7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir index 9345e92789327..04cb0b14679bb 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir +++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir @@ -72,7 +72,7 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, [[COPY3]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -80,7 +80,7 @@ body: | undef %3.sub0:areg_96 = COPY %0 %3.sub1:areg_96 = COPY %1 %3.sub2:areg_96 = COPY %2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -101,7 +101,7 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6357001 /* reguse:AReg_96_Align2 */, [[COPY3]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -109,7 +109,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0 %3.sub1:areg_96_align2 = COPY %1 %3.sub2:areg_96_align2 = COPY %2 - INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6357001 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -128,13 +128,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0 %2.sub2_sub3:areg_128 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... 
@@ -153,13 +153,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0 %2.sub2_sub3:areg_128_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8257545 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -203,13 +203,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0 %2.sub1_sub2:areg_96 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -253,13 +253,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vgpr_32 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0 %2.sub2:areg_96 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -350,12 +350,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96 = COPY %0 %1.sub1:areg_96 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, %1 SI_RETURN ... 
@@ -373,12 +373,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6357001 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_96_align2 = COPY %0 %1.sub1:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 6357001 /* reguse:AReg_96_Align2 */, %1 SI_RETURN ... @@ -398,14 +398,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0 %1.sub1:areg_128 = COPY %0 %1.sub2:areg_128 = COPY %0 %1.sub3:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %1 SI_RETURN ... @@ -425,14 +425,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0 %1.sub1:areg_128_align2 = COPY %0 %1.sub2:areg_128_align2 = COPY %0 %1.sub3:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8257545 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... @@ -585,7 +585,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -593,7 +593,7 @@ body: | undef %3.sub0:areg_96 = COPY %0.sub0 %3.sub1:areg_96 = COPY %0.sub1 %3.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, %3 SI_RETURN ... 
@@ -614,7 +614,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6357001 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -622,7 +622,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0.sub0 %3.sub1:areg_96_align2 = COPY %0.sub1 %3.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6357001 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -641,13 +641,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -668,13 +668,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1 %0.sub1:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0 %2.sub2_sub3:areg_128_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8257545 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -718,13 +718,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 =COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub0 %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, %2 SI_RETURN ... 
@@ -768,13 +768,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1 %2.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -841,13 +841,13 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_96 = COPY %0.sub0 %1.sub1:areg_96 = COPY %0.sub0 %1.sub2:areg_96 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, %1 SI_RETURN ... @@ -865,12 +865,12 @@ body: | ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6357001 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_96_align2 = COPY %0.sub0 %1.sub1:areg_96_align2 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 6357001 /* reguse:AReg_96_Align2 */, %1 SI_RETURN ... @@ -890,14 +890,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0.sub0 %1.sub1:areg_128 = COPY %0.sub0 %1.sub2:areg_128 = COPY %0.sub0 %1.sub3:areg_128 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %1 SI_RETURN ... 
@@ -917,14 +917,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0.sub0 %1.sub1:areg_128_align2 = COPY %0.sub0 %1.sub2:areg_128_align2 = COPY %0.sub0 %1.sub3:areg_128_align2 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8257545 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... @@ -995,7 +995,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1:vreg_96 = COPY $vgpr1 @@ -1003,7 +1003,7 @@ body: | undef %3.sub0:areg_96 = COPY %0.sub0 %3.sub1:areg_96 = COPY %0.sub1 %3.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -1024,7 +1024,7 @@ body: | ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6357001 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96_align2 = COPY $vgpr0 %0.sub1:vreg_96_align2 = COPY $vgpr1 @@ -1032,7 +1032,7 @@ body: | undef %3.sub0:areg_96_align2 = COPY %0.sub0 %3.sub1:areg_96_align2 = COPY %0.sub1 %3.sub2:areg_96_align2 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6357001 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -1051,13 +1051,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... 
@@ -1076,13 +1076,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8257545 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -1126,13 +1126,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub0 %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -1150,13 +1150,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_96 = COPY $vgpr0 %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2 undef %2.sub0:areg_96 = COPY %0.sub2 %2.sub1_sub2:areg_96 = COPY %0.sub0_sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, %2 SI_RETURN ... @@ -1201,13 +1201,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1 %0.sub2:vreg_96 = COPY $vgpr2 undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1 %2.sub2:areg_96 = COPY %0.sub2 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, %2 SI_RETURN ... 
@@ -1316,11 +1316,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 %3:areg_96 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 5832713 /* reguse:AReg_96 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_96 */, %3 SI_RETURN ... @@ -1337,11 +1337,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6357001 /* reguse:AReg_96_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2 %3:areg_96_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_96_Align2 */, %3 + INLINEASM &"; use $0", 0 /* attdialect */, 6357001 /* reguse:AReg_96_Align2 */, %3 SI_RETURN ... @@ -1358,11 +1358,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 7733257 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 7929865 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -1379,11 +1379,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8257545 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... 
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll index 31c23b94a8de8..66d99b14e282d 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll @@ -226,9 +226,8 @@ define void @private_alloca_to_flat(ptr %ptr) { ; GISEL-ASM-LABEL: private_alloca_to_flat: ; GISEL-ASM: ; %bb.0: ; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base ; GISEL-ASM-NEXT: s_lshr_b32 s4, s32, 6 -; GISEL-ASM-NEXT: s_mov_b64 s[6:7], src_private_base -; GISEL-ASM-NEXT: s_mov_b32 s5, s7 ; GISEL-ASM-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 ; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s5 @@ -330,21 +329,21 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) { ; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; DAGISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split ; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5] -; DAGISEL-ASM-NEXT: s_xor_b64 s[6:7], vcc, -1 -; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], 0 -; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base +; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base +; DAGISEL-ASM-NEXT: s_xor_b64 s[8:9], vcc, -1 +; DAGISEL-ASM-NEXT: s_mov_b64 s[6:7], 0 ; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 ; DAGISEL-ASM-NEXT: .LBB11_3: ; %finally ; DAGISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1 -; DAGISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[6:7] -; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 +; DAGISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[8:9] +; DAGISEL-ASM-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s5 ; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2 ; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) -; DAGISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[4:5] +; DAGISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[6:7] ; DAGISEL-ASM-NEXT: s_cbranch_execnz .LBB11_3 ; DAGISEL-ASM-NEXT: ; %bb.4: ; %end -; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5] +; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7] ; DAGISEL-ASM-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index ee87c65c00def..1a4a54b81c78f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -254,8 +254,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -447,8 +447,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -846,8 +846,8 @@ 
define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -900,8 +900,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1070,8 +1070,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1124,8 +1124,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1288,8 +1288,8 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1479,8 +1479,8 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1535,8 +1535,8 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2016,8 +2016,8 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2070,8 +2070,8 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2579,8 +2579,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2772,8 +2772,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3171,8 +3171,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3225,8 +3225,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3395,8 +3395,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: 
v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3449,8 +3449,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3613,8 +3613,8 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3804,8 +3804,8 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3860,8 +3860,8 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4024,8 +4024,8 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4215,8 +4215,8 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4271,8 +4271,8 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 -; GFX908-NEXT: v_addc_co_u32_e32 
v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6201,9 +6201,9 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -6369,8 +6369,8 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6427,8 +6427,8 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6662,10 +6662,10 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -6831,8 +6831,8 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6889,8 +6889,8 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: 
v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -7552,9 +7552,9 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7715,8 +7715,8 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -7771,8 +7771,8 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8001,10 +8001,10 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8165,8 +8165,8 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8221,8 +8221,8 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; 
GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index b8a2476dc19b4..59b0537b817d2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -3255,9 +3255,9 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3397,8 +3397,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3438,8 +3438,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3667,10 +3667,10 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3810,8 +3810,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3851,8 +3851,8 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, 
vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4465,9 +4465,9 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4605,8 +4605,8 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4645,8 +4645,8 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4870,10 +4870,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5011,8 +5011,8 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5051,8 +5051,8 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 -; 
GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 9830e48a86f06..c9c9f332fe391 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -3255,9 +3255,9 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3397,8 +3397,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3438,8 +3438,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3667,10 +3667,10 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3810,8 +3810,8 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3851,8 +3851,8 @@ define double 
@flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4465,9 +4465,9 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4605,8 +4605,8 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4645,8 +4645,8 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4870,10 +4870,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5011,8 +5011,8 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5051,8 +5051,8 @@ define void 
@flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 2c1970220c374..587c2ea885077 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -3711,9 +3711,9 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3871,8 +3871,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3922,8 +3922,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4157,10 +4157,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -4318,8 +4318,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 
s5, v5 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4369,8 +4369,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5019,9 +5019,9 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b64 s[0:1], 0x7f8 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5175,8 +5175,8 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5225,8 +5225,8 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5455,10 +5455,10 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0xf800 -; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-NEXT: s_movk_i32 s2, 0xf800 +; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5612,8 +5612,8 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5662,8 +5662,8 @@ define void 
@flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index f8c2ddf0d7d3c..082050877e0bb 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -637,8 +637,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -824,11 +824,11 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -995,8 +995,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -1028,9 +1028,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -1150,11 +1149,11 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], 
s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -1497,8 +1496,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -1688,11 +1687,11 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -1869,8 +1868,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -1905,9 +1904,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -2038,11 +2036,11 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: 
v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -2393,8 +2391,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -2586,11 +2584,11 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -2769,8 +2767,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -2807,9 +2805,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -2940,11 +2937,11 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -3299,8 +3296,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg 
%sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3492,11 +3489,11 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -3675,8 +3672,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -3712,9 +3709,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -3846,11 +3842,11 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4203,8 +4199,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: 
v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -4396,11 +4392,11 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -4579,8 +4575,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4616,9 +4612,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4750,11 +4745,11 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5107,8 +5102,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -5300,11 +5295,11 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr 
inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -5483,8 +5478,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5520,9 +5515,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5654,11 +5648,11 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -5971,8 +5965,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -6156,11 +6150,11 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 
-; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -6333,8 +6327,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6370,9 +6364,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6496,11 +6489,11 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -6813,8 +6806,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -6998,11 +6991,11 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: 
v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -7175,8 +7168,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7212,9 +7205,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7338,11 +7330,11 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -7655,8 +7647,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -7840,11 +7832,11 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -8017,8 +8009,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8054,9 +8046,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8180,11 +8171,11 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8497,8 +8488,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -8682,11 +8673,11 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -8859,8 +8850,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: 
v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -8896,9 +8887,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -9022,11 +9012,11 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -9393,8 +9383,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -9594,14 +9584,14 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -9788,8 +9778,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 @@ -9831,9 +9821,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg 
%sbase, i32 %voffs ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -9969,11 +9958,11 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 @@ -10299,8 +10288,8 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -10498,11 +10487,11 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -10683,8 +10672,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -10720,9 +10709,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; 
GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -10852,11 +10840,11 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -11180,8 +11168,8 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -11385,11 +11373,11 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -11576,8 +11564,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -11614,9 +11602,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -11751,11 +11738,11 @@ define amdgpu_ps void 
@flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 +; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -11841,8 +11828,8 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 @@ -11899,8 +11886,8 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s0, 0x50 -; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -11958,8 +11945,8 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -12006,8 +11993,8 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -12061,8 +12048,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 @@ -12121,8 +12108,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s0, 0x50 -; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_6 @@ -12177,8 +12164,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -12225,8 +12212,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB111_6 @@ -12355,8 +12342,8 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -12388,8 +12375,8 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 -; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 ; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -12506,8 +12493,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -12540,8 +12527,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 -; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 ; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB113_2 @@ -12656,8 +12643,8 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -12689,8 +12676,8 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 -; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 ; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -12807,8 +12794,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 -; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] @@ -12841,8 +12828,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 -; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 ; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB115_2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index d9a596283db1e..1f105e8dd8ba5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -106,11 +106,11 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -401,17 +401,17 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_add_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: 
s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB2_3 @@ -555,15 +555,15 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB3_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -696,9 +696,9 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -979,15 +979,16 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_add_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB6_3 @@ -1267,11 +1268,11 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: 
s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -1556,17 +1557,17 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_and_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB10_3 @@ -1707,15 +1708,15 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB11_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1845,9 +1846,9 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -2122,15 +2123,16 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_and_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 
s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB14_3 @@ -2408,11 +2410,11 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -2703,17 +2705,17 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_sub_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB18_3 @@ -2857,15 +2859,15 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB19_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -2998,9 +3000,9 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 
s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -3281,15 +3283,16 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_sub_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB22_3 @@ -3571,11 +3574,11 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -3868,17 +3871,17 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_max_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB26_3 @@ -4024,15 +4027,15 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB27_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -4165,9 +4168,9 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -4450,15 +4453,16 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_max_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB30_3 @@ -4742,11 +4746,11 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -5039,17 +5043,17 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umax_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; 
GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB34_3 @@ -5195,15 +5199,15 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB35_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -5336,9 +5340,9 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -5621,15 +5625,16 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umax_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB38_3 @@ -5913,11 +5918,11 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -6210,17 +6215,17 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_min_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB42_3 @@ -6366,15 +6371,15 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB43_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -6507,9 +6512,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -6792,15 +6797,16 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_min_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], 
s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB46_3 @@ -7084,11 +7090,11 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -7381,17 +7387,17 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_umin_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB50_3 @@ -7537,15 +7543,15 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB51_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -7678,9 +7684,9 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; 
%entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -7963,15 +7969,16 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_umin_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB54_3 @@ -8253,11 +8260,11 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -8542,17 +8549,17 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-LABEL: atomic_or_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB58_3 @@ -8693,15 +8700,15 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB59_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -8831,9 +8838,9 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -9108,15 +9115,16 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-LABEL: atomic_or_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB62_3 @@ -9384,11 +9392,11 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -9510,11 +9518,11 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 
s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -9636,11 +9644,11 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -9911,17 +9919,17 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-LABEL: atomic_xchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB68_3 @@ -10057,15 +10065,15 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB69_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -10186,9 +10194,9 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, 
exec_lo, s4 @@ -10449,15 +10457,16 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB72_3 @@ -10727,11 +10736,11 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -11016,17 +11025,17 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_xor_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB76_3 @@ -11167,15 +11176,15 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: 
s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB77_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -11305,9 +11314,9 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -11582,15 +11591,16 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_xor_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB80_3 @@ -12745,15 +12755,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB93_3 @@ -12908,14 +12918,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3] ; 
GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s3, s5 +; GFX12-NEXT: s_cmp_eq_u32 s3, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_cbranch_vccz .LBB94_2 @@ -13510,12 +13520,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3] -; GFX12-NEXT: s_cmp_eq_u32 s3, s5 +; GFX12-NEXT: s_cmp_eq_u32 s3, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -14071,11 +14081,11 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -14380,17 +14390,17 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_inc_i64_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB109_3 @@ -14541,15 +14551,15 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; 
GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB110_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14689,9 +14699,9 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_inc_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -14986,15 +14996,16 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_inc_i64_incr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB113_3 @@ -15296,11 +15307,11 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 @@ -15625,17 +15636,17 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-LABEL: atomic_dec_i64_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, 
s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB117_3 @@ -15797,15 +15808,15 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 -; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 ; GFX12-NEXT: s_cselect_b32 s6, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX12-NEXT: s_cbranch_vccz .LBB118_2 ; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15954,9 +15965,9 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX12-LABEL: atomic_dec_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -16271,15 +16282,16 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-LABEL: atomic_dec_i64_decr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_cmp_eq_u32 s1, s7 ; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX12-NEXT: s_mov_b32 s4, -1 ; GFX12-NEXT: s_cbranch_vccnz .LBB121_3 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 524100c5b7a25..9e27f6badfdac 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -187,8 +187,8 @@ define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 
v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -422,8 +422,8 @@ define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -660,8 +660,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -897,8 +897,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -1010,8 +1010,8 @@ define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1126,8 +1126,8 @@ define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1347,8 +1347,8 @@ define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1582,8 +1582,8 @@ define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1820,8 +1820,8 @@ define amdgpu_gfx 
void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -2057,8 +2057,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -2170,8 +2170,8 @@ define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2286,8 +2286,8 @@ define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2537,8 +2537,8 @@ define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2797,8 +2797,8 @@ define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3063,8 +3063,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -3320,8 +3320,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 
s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -3448,8 +3448,8 @@ define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3578,8 +3578,8 @@ define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3924,8 +3924,8 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4284,8 +4284,8 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -4656,8 +4656,8 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -5017,8 +5017,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -5159,8 +5159,8 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] @@ -5289,8 +5289,8 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5635,8 +5635,8 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5995,8 +5995,8 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6357,8 +6357,8 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -6706,8 +6706,8 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -6846,8 +6846,8 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6976,8 +6976,8 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -7342,8 +7342,8 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: 
v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -7726,8 +7726,8 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -8112,8 +8112,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -8485,8 +8485,8 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -8675,8 +8675,8 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8871,8 +8871,8 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9237,8 +9237,8 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -9597,8 +9597,8 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; 
GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9959,8 +9959,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -10308,8 +10308,8 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -10448,8 +10448,8 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -10578,8 +10578,8 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -10924,8 +10924,8 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11284,8 +11284,8 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -11646,8 +11646,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -11995,8 +11995,8 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr 
inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -12135,8 +12135,8 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12265,8 +12265,8 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -12611,8 +12611,8 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12971,8 +12971,8 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -13358,8 +13358,8 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -13737,8 +13737,8 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -13939,20 +13939,20 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_mov_b32 s14, -1 ; GCN3-NEXT: s_mov_b32 s15, 0xe00000 ; GCN3-NEXT: s_add_u32 s12, s12, s11 ; GCN3-NEXT: s_addc_u32 s13, s13, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: s_add_u32 s0, s0, 32 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: s_mov_b64 s[6:7], src_private_base ; GCN3-NEXT: s_addc_u32 s1, s1, 0 -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_cmp_eq_u32 s1, s7 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], -1 @@ -14368,18 +14368,18 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_mov_b32 s14, -1 ; GCN3-NEXT: s_mov_b32 s15, 0xe00000 ; GCN3-NEXT: s_add_u32 s12, s12, s11 ; GCN3-NEXT: s_addc_u32 s13, s13, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base -; GCN3-NEXT: s_addc_u32 s1, s1, s7 -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_mov_b64 s[6:7], src_private_base +; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_cmp_eq_u32 s1, s7 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], -1 @@ -14734,8 +14734,8 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -14864,8 +14864,8 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -15210,8 +15210,8 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -15570,8 +15570,8 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -15957,8 +15957,8 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -16336,8 +16336,8 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -16538,20 +16538,20 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_mov_b32 s14, -1 ; GCN3-NEXT: s_mov_b32 s15, 0xe00000 ; GCN3-NEXT: s_add_u32 s12, s12, s11 ; GCN3-NEXT: s_addc_u32 s13, s13, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: s_add_u32 s0, s0, 32 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: s_mov_b64 s[6:7], src_private_base ; GCN3-NEXT: s_addc_u32 s1, s1, 0 -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_cmp_eq_u32 s1, s7 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], -1 @@ -17124,8 +17124,8 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -17254,8 +17254,8 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -17600,8 +17600,8 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -17960,8 +17960,8 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18347,8 +18347,8 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -18726,8 +18726,8 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -18871,8 +18871,8 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -19001,8 +19001,8 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -19347,8 +19347,8 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -19707,8 +19707,8 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, 
vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -20094,8 +20094,8 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -20473,8 +20473,8 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -20675,20 +20675,20 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GCN3-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_mov_b32 s14, -1 ; GCN3-NEXT: s_mov_b32 s15, 0xe00000 ; GCN3-NEXT: s_add_u32 s12, s12, s11 ; GCN3-NEXT: s_addc_u32 s13, s13, 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GCN3-NEXT: s_add_u32 s0, s0, s6 -; GCN3-NEXT: s_addc_u32 s1, s1, s7 +; GCN3-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GCN3-NEXT: s_add_u32 s0, s0, s4 +; GCN3-NEXT: s_addc_u32 s1, s1, s5 ; GCN3-NEXT: s_add_u32 s0, s0, 32 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: s_mov_b64 s[6:7], src_private_base ; GCN3-NEXT: s_addc_u32 s1, s1, 0 -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_cmp_eq_u32 s1, s7 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], -1 @@ -21101,9 +21101,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: s_mov_b32 s15, 0xe00000 ; GCN3-NEXT: s_add_u32 s12, s12, s11 ; GCN3-NEXT: s_addc_u32 s13, s13, 0 -; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: s_mov_b64 s[6:7], src_private_base ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_cmp_eq_u32 s1, s5 +; GCN3-NEXT: s_cmp_eq_u32 s1, s7 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN3-NEXT: s_mov_b64 s[4:5], -1 @@ -21457,8 +21457,8 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -21587,8 +21587,8 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: 
v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -21958,8 +21958,8 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -22348,8 +22348,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -22745,8 +22745,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -23130,8 +23130,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_addc_u32 s35, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] @@ -23282,8 +23282,8 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -23421,8 +23421,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -23810,8 +23810,8 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[8:9], exec, 
s[4:5] @@ -24223,8 +24223,8 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24659,8 +24659,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s38, s4, 32 -; GCN3-NEXT: s_addc_u32 s39, s5, 0 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_addc_u32 s39, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s39, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] @@ -25091,8 +25091,8 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: s_add_u32 s38, s4, 32 -; GCN3-NEXT: s_addc_u32 s39, s5, 0 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_addc_u32 s39, s5, 0 ; GCN3-NEXT: s_cmp_eq_u32 s39, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] @@ -25253,8 +25253,8 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -25397,8 +25397,8 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 -; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll index 9db760077a853..872c2cf569dcc 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll @@ -6,21 +6,21 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; CHECK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; CHECK-NEXT: s_load_dword s6, s[4:5], 0x24 +; CHECK-NEXT: s_load_dword s8, s[4:5], 0x24 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; CHECK-NEXT: s_mov_b32 s14, -1 ; CHECK-NEXT: s_mov_b32 s15, 0xe00000 ; CHECK-NEXT: s_add_u32 s12, s12, s11 ; CHECK-NEXT: s_addc_u32 s13, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s7, s6, 31 -; CHECK-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; CHECK-NEXT: s_add_u32 s0, s0, s6 -; CHECK-NEXT: s_addc_u32 s1, s1, s7 +; CHECK-NEXT: s_ashr_i32 s9, s8, 31 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; CHECK-NEXT: s_add_u32 s0, s0, s4 +; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: 
s_add_u32 s0, s0, -8 -; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base +; CHECK-NEXT: s_mov_b64 s[6:7], src_shared_base ; CHECK-NEXT: s_addc_u32 s1, s1, -1 -; CHECK-NEXT: s_cmp_eq_u32 s1, s5 +; CHECK-NEXT: s_cmp_eq_u32 s1, s7 ; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_mov_b64 s[4:5], -1 diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index ec80efc5f0362..2daed9b69384f 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -56,19 +56,19 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V4-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9V4-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 -; GFX9V4-NEXT: s_cselect_b32 s2, s3, 0 -; GFX9V4-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 -; GFX9V4-NEXT: v_mov_b32_e32 v1, s2 -; GFX9V4-NEXT: s_cselect_b32 s0, s5, 0 -; GFX9V4-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9V4-NEXT: s_cmp_lg_u32 s4, -1 +; GFX9V4-NEXT: s_cselect_b32 s0, s1, 0 +; GFX9V4-NEXT: s_cselect_b32 s1, s4, 0 +; GFX9V4-NEXT: s_cmp_lg_u32 s5, -1 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s1 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s0 +; GFX9V4-NEXT: s_cselect_b32 s0, s3, 0 +; GFX9V4-NEXT: s_cselect_b32 s1, s5, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V4-NEXT: v_mov_b32_e32 v3, s0 ; GFX9V4-NEXT: flat_store_dword v[0:1], v4 @@ -80,19 +80,19 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9V5-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9V5-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 -; GFX9V5-NEXT: s_cselect_b32 s2, s3, 0 -; GFX9V5-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 -; GFX9V5-NEXT: v_mov_b32_e32 v1, s2 -; GFX9V5-NEXT: s_cselect_b32 s0, s5, 0 -; GFX9V5-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9V5-NEXT: s_cmp_lg_u32 s4, -1 +; GFX9V5-NEXT: s_cselect_b32 s0, s1, 0 +; GFX9V5-NEXT: s_cselect_b32 s1, s4, 0 +; GFX9V5-NEXT: s_cmp_lg_u32 s5, -1 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s1 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s0 +; GFX9V5-NEXT: s_cselect_b32 s0, s3, 0 +; GFX9V5-NEXT: s_cselect_b32 s1, s5, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v2, s1 ; GFX9V5-NEXT: v_mov_b32_e32 v3, s0 ; GFX9V5-NEXT: flat_store_dword v[0:1], v4 @@ -136,10 +136,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; ; GFX9V4-LABEL: llvm_amdgcn_is_shared: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9V4-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9V4-NEXT: s_load_dword s0, s[8:9], 0x4 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 
; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -148,10 +148,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; ; GFX9V5-LABEL: llvm_amdgcn_is_shared: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9V5-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9V5-NEXT: s_load_dword s0, s[8:9], 0x4 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 ; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off @@ -190,10 +190,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; ; GFX9V4-LABEL: llvm_amdgcn_is_private: ; GFX9V4: ; %bb.0: -; GFX9V4-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9V4-NEXT: s_load_dword s0, s[8:9], 0x4 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V4-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 ; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V4-NEXT: global_store_dword v[0:1], v0, off @@ -202,10 +202,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; ; GFX9V5-LABEL: llvm_amdgcn_is_private: ; GFX9V5: ; %bb.0: -; GFX9V5-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9V5-NEXT: s_load_dword s0, s[8:9], 0x4 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX9V5-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 ; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9V5-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir index ccfb8f1d1fe9f..65f373bd1cfc8 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir @@ -486,7 +486,7 @@ body: | ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 38600713 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 38797321 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_ENDPGM 0 bb.0: S_NOP 0, implicit-def $agpr0 @@ -516,7 +516,7 @@ body: | S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 38600713 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2 + INLINEASM &"; use $0 ", 1 /* 
sideeffect attdialect */, 38797321 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2 S_ENDPGM 0 ... @@ -1368,7 +1368,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 38600713 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 38797321 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ -1408,7 +1408,7 @@ body: | undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 38600713 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 38797321 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 @@ -1726,7 +1726,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 38600713 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 38797321 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 ; 
CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ -1763,7 +1763,7 @@ body: | undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 38600713 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 38797321 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index aa75dd1386396..cb2a4ee584594 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9043978 /* regdef:SGPR_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9240586 /* regdef:SGPR_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9043977 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9043978 /* regdef:SGPR_128 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9240586 /* regdef:SGPR_128 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9043977 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9240585 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:VReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:VReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7208969 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:VReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:VReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7405577 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: 
INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7602185 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -47,16 +47,16 @@ define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7733258 /* regdef:AReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:AReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7733257 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8060938 /* regdef:AReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:AReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 9e1815b48abfd..0d6f84edcb487 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -46,12 +46,12 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX9-LABEL: is_private_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dword v[0:1], v0, off @@ -79,13 +79,12 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX10-LABEL: is_private_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: global_store_dword v[0:1], v0, off @@ -93,14 +92,14 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX11-LABEL: is_private_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -156,10 +155,10 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX9-SDAG-LABEL: is_private_sgpr: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-SDAG-NEXT: s_load_dword s0, s[8:9], 0x4 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s0, s1 ; GFX9-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 @@ -190,10 +189,10 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX9-GISEL-LABEL: is_private_sgpr: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, s1 ; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %bb0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -204,10 +203,10 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX10-LABEL: is_private_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s1, s3 +; GFX10-NEXT: s_cmp_lg_u32 s3, s1 ; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %bb0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -218,10 +217,10 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX11-LABEL: is_private_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s1, s3 +; GFX11-NEXT: s_cmp_lg_u32 s3, s1 ; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index f1dcc93172fb1..63333ed165a32 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -81,12 +81,12 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX9-LABEL: is_local_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 
v[0:1], v0, s[0:1] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s1, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dword v[0:1], v0, off @@ -94,15 +94,14 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX1250-LABEL: is_local_vgpr: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[2:3] scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX1250-NEXT: global_store_b32 v[0:1], v0, off ; GFX1250-NEXT: s_endpgm @@ -129,13 +128,12 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX10-LABEL: is_local_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: global_store_dword v[0:1], v0, off @@ -143,14 +141,14 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX11-LABEL: is_local_vgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off @@ -240,10 +238,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX9-SDAG-LABEL: is_local_sgpr: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dword s2, s[8:9], 0x4 ; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-SDAG-NEXT: s_load_dword s0, s[8:9], 0x4 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_cmp_eq_u32 s2, s1 +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s0, s1 ; GFX9-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-SDAG-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 @@ -256,10 +254,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX1250-SDAG-LABEL: is_local_sgpr: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b32 s2, s[4:5], 
0x4 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s2, s1 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, s1 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 @@ -291,10 +289,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX9-GISEL-LABEL: is_local_sgpr: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, s1 ; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %bb0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -305,10 +303,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX10-LABEL: is_local_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s1, s3 +; GFX10-NEXT: s_cmp_lg_u32 s3, s1 ; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %bb0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -319,10 +317,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX11-LABEL: is_local_sgpr: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s1, s3 +; GFX11-NEXT: s_cmp_lg_u32 s3, s1 ; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 @@ -333,10 +331,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX1250-GISEL-LABEL: is_local_sgpr: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s3, s1 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %bb0 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index ecf1d3bcdc86d..d65c6d950058e 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:VReg_128 */, def %25 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:VReg_128 */, def %25 ; REGALLOC-GFX908-NEXT: 
[[COPY:%[0-9]+]]:av_128 = COPY %25 ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3670026 /* regdef:VReg_64 */, def %27 ; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) @@ -37,7 +37,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3670026 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) @@ -61,7 +61,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:VReg_128_Align2 */, def %23 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:VReg_128_Align2 */, def %23 ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %23 ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %21 ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %21 @@ -80,7 +80,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; PEI-GFX90A-NEXT: {{ $}} ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7405578 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-64-bit-only-regs.mir b/llvm/test/CodeGen/AMDGPU/regcoalesce-64-bit-only-regs.mir index 038e195742305..7475c15f6357a 100644 --- 
a/llvm/test/CodeGen/AMDGPU/regcoalesce-64-bit-only-regs.mir +++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-64-bit-only-regs.mir @@ -1,55 +1,66 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=register-coalescer -show-mc-encoding -o - %s | FileCheck %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=register-coalescer -o - %s | FileCheck %s -# FIXME: These SRC_*_HI registers do not exist, although defined in the register file -# and happily used by the coalescer. The resulting encoding is in fact belong -# to the 64-bit register and corresponding *_LO 32-bit part of it. +# These SRC_*_HI registers do not exist; make sure the coalescer does not use them. -# CHECK-LABEL: src_private_base: -# CHECK: s_subb_u32 s0, SRC_PRIVATE_BASE_HI, s1 ; encoding: [0xed,0x01,0x80,0x82] --- name: src_private_base tracksRegLiveness: true body: | bb.0: + ; CHECK-LABEL: name: src_private_base + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $src_private_base + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: $scc = IMPLICIT_DEF + ; CHECK-NEXT: dead [[S_SUBB_U32_:%[0-9]+]]:sreg_32 = S_SUBB_U32 [[COPY]].sub1, [[DEF]].sub1, implicit-def dead $scc, implicit killed $scc %0:sreg_64 = COPY $src_private_base %1:sreg_64 = IMPLICIT_DEF $scc = IMPLICIT_DEF %2:sreg_32 = S_SUBB_U32 killed %0.sub1:sreg_64, %1.sub1:sreg_64, implicit-def dead $scc, implicit killed $scc ... -# CHECK-LABEL: src_private_limit: -# CHECK: s_subb_u32 s0, SRC_PRIVATE_LIMIT_HI, s1 ; encoding: [0xee,0x01,0x80,0x82] --- name: src_private_limit tracksRegLiveness: true body: | bb.0: + ; CHECK-LABEL: name: src_private_limit + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $src_private_limit + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: $scc = IMPLICIT_DEF + ; CHECK-NEXT: dead [[S_SUBB_U32_:%[0-9]+]]:sreg_32 = S_SUBB_U32 [[COPY]].sub1, [[DEF]].sub1, implicit-def dead $scc, implicit killed $scc %0:sreg_64 = COPY $src_private_limit %1:sreg_64 = IMPLICIT_DEF $scc = IMPLICIT_DEF %2:sreg_32 = S_SUBB_U32 killed %0.sub1:sreg_64, %1.sub1:sreg_64, implicit-def dead $scc, implicit killed $scc ... -# CHECK-LABEL: src_shared_base: -# CHECK: s_subb_u32 s0, SRC_SHARED_BASE_HI, s1 ; encoding: [0xeb,0x01,0x80,0x82] --- name: src_shared_base tracksRegLiveness: true body: | bb.0: + ; CHECK-LABEL: name: src_shared_base + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $src_shared_base + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: $scc = IMPLICIT_DEF + ; CHECK-NEXT: dead [[S_SUBB_U32_:%[0-9]+]]:sreg_32 = S_SUBB_U32 [[COPY]].sub1, [[DEF]].sub1, implicit-def dead $scc, implicit killed $scc %0:sreg_64 = COPY $src_shared_base %1:sreg_64 = IMPLICIT_DEF $scc = IMPLICIT_DEF %2:sreg_32 = S_SUBB_U32 killed %0.sub1:sreg_64, %1.sub1:sreg_64, implicit-def dead $scc, implicit killed $scc ...
-# CHECK-LABEL: src_shared_limit:
-# CHECK: s_subb_u32 s0, SRC_SHARED_LIMIT_HI, s1 ; encoding: [0xec,0x01,0x80,0x82]
 ---
 name: src_shared_limit
 tracksRegLiveness: true
 body: |
   bb.0:
+    ; CHECK-LABEL: name: src_shared_limit
+    ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $src_shared_limit
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: $scc = IMPLICIT_DEF
+    ; CHECK-NEXT: dead [[S_SUBB_U32_:%[0-9]+]]:sreg_32 = S_SUBB_U32 [[COPY]].sub1, [[DEF]].sub1, implicit-def dead $scc, implicit killed $scc
     %0:sreg_64 = COPY $src_shared_limit
     %1:sreg_64 = IMPLICIT_DEF
     $scc = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
index e9c9170caeac4..909781699289c 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
@@ -19,7 +19,7 @@ body: |
     ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
     ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
-    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY3]]
     ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
     ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     ; CHECK-NEXT: SI_RETURN
@@ -30,7 +30,7 @@ body: |
     %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
     undef %5.sub0_sub1:areg_128_align2 = COPY %4
     %5.sub2_sub3 = IMPLICIT_DEF
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %5
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, %5
     GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     SI_RETURN
@@ -172,7 +172,7 @@ body: |
     ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3
     ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
-    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY3]]
     ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
     ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     ; CHECK-NEXT: SI_RETURN
@@ -183,7 +183,7 @@ body: |
     undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
     undef %5.sub0_sub1:areg_128_align2 = COPY %4.sub2_sub3
     %5.sub2_sub3 = IMPLICIT_DEF
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %5
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, %5
     GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     SI_RETURN
@@ -208,7 +208,7 @@ body: |
     ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2
     ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
-    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY3]]
     ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
     ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     ; CHECK-NEXT: SI_RETURN
@@ -219,7 +219,7 @@ body: |
     undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
     undef %5.sub1:areg_128_align2 = COPY %4.sub2
     %5.sub2_sub3 = IMPLICIT_DEF
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %5
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, %5
     GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     SI_RETURN
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
index b51aad748bc28..16d1424f8f57b 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
@@ -17,7 +17,7 @@ body: |
     ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
     ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY3]]
     ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -26,7 +26,7 @@ body: |
     %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
     %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
     undef %5.sub0_sub1:areg_128_align2 = COPY %4
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %5
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, %5
     GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     SI_RETURN
 ...
@@ -47,7 +47,7 @@ body: |
     ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
     ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY3]]
     ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -56,7 +56,7 @@ body: |
     %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
     %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
     undef %5.sub0_sub1:areg_128_align2 = COPY %4
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %5
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, %5
     GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     SI_RETURN
 ...
@@ -151,7 +151,7 @@ body: |
     ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1
     ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY3]]
     ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -163,7 +163,7 @@ body: |
     %other_use:vreg_64_align2 = COPY %5.sub0_sub1
     %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
     undef %8.sub0_sub1:areg_128_align2 = COPY %6
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %8:areg_128_align2
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, %8:areg_128_align2
     GLOBAL_STORE_DWORDX4 %0, %8, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     SI_RETURN
 
@@ -231,7 +231,7 @@ body: |
     ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3
     ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, [[COPY3]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, [[COPY3]]
     ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64_align2 = COPY $vgpr4_vgpr5
@@ -245,7 +245,7 @@ body: |
     %other_use1:vreg_64_align2 = COPY %4.sub2_sub3
     %other_use2:vreg_64 = COPY %4.sub1_sub2
     %6:areg_128_align2 = COPY %4
-    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8060937 /* reguse:AReg_128_Align2 */, %6:areg_128_align2
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:AReg_128_Align2 */, %6:areg_128_align2
     GLOBAL_STORE_DWORDX4 %0, %6, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     SI_RETURN
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir
index d86e5e6ec7bac..9553fcc1c51c8 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir
@@ -49,6 +49,15 @@ body: |
     $sgpr2_sgpr3 = COPY killed $sgpr0_sgpr1
 ...
 
+---
+name: src_shared_base_to_sgpr64
+body: |
+  bb.0:
+    ; GFX9-LABEL: name: src_shared_base_to_sgpr64
+    ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $src_shared_base
+    $sgpr0_sgpr1 = COPY $src_shared_base
+...
+
 ---
 name: sgpr96_aligned_src_dst
 body: |
diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
index 8867e6102406b..d2008be4fd32a 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
+++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
@@ -10,17 +10,17 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
 ; CHECK-NEXT: v_pk_mov_b32 v[46:47], 0, 0
 ; CHECK-NEXT: flat_load_dword v42, v[46:47]
-; CHECK-NEXT: s_load_dwordx4 s[64:67], s[8:9], 0x8
-; CHECK-NEXT: s_load_dword s68, s[8:9], 0x0
+; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
+; CHECK-NEXT: s_load_dwordx4 s[64:67], s[34:35], 0x8
+; CHECK-NEXT: s_load_dword s68, s[34:35], 0x0
 ; CHECK-NEXT: s_add_u32 s0, s0, s17
 ; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_mov_b64 s[8:9], src_private_base
 ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
-; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_cmp_lg_u32 s68, -1
-; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
 ; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_cselect_b32 s5, s5, 0
+; CHECK-NEXT: s_cselect_b32 s5, s9, 0
 ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
 ; CHECK-NEXT: s_cselect_b32 s6, s68, 0
 ; CHECK-NEXT: v_mov_b32_e32 v57, s5
diff --git a/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll b/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll
index 8b54f709eec7a..1f13282a1f04c 100644
--- a/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll
+++ b/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll
@@ -14,17 +14,15 @@ define void @_Z12lane_pc_testj() #0 !dbg !9 {
 ; GCN-NEXT: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: ; %bb.1: ; %lab
+; GCN-NEXT: s_mov_b64 s[4:5], 0
 ; GCN-NEXT: .Ltmp0:
 ; GCN-NEXT: .loc 0 12 1 prologue_end ; t.cpp:12:1
-; GCN-NEXT: s_mov_b64 s[4:5], src_private_base
-; GCN-NEXT: s_mov_b32 s6, 32
-; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; GCN-NEXT: s_mov_b64 s[6:7], 0
-; GCN-NEXT: s_mov_b32 s5, -1
+; GCN-NEXT: s_mov_b64 s[6:7], src_private_base
+; GCN-NEXT: s_mov_b32 s6, -1
 ; GCN-NEXT: s_lshr_b32 s8, s32, 5
-; GCN-NEXT: s_cmp_lg_u32 s8, s5
-; GCN-NEXT: s_cselect_b32 s5, s4, s7
-; GCN-NEXT: s_cselect_b32 s4, s8, s6
+; GCN-NEXT: s_cmp_lg_u32 s8, s6
+; GCN-NEXT: s_cselect_b32 s5, s7, s5
+; GCN-NEXT: s_cselect_b32 s4, s8, s4
 ; GCN-NEXT: v_mov_b32_e32 v2, 0
 ; GCN-NEXT: .loc 0 13 1 ; t.cpp:13:1
 ; GCN-NEXT: v_mov_b32_e32 v0, s4
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
index bd1eb4c4e6d1c..6e9ed581cefe1 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
@@ -7,10 +7,10 @@ define i64 @i64_test(i64 %i) nounwind readnone {
 ; CHECK-NEXT:   t0: ch,glue = EntryToken
 ; CHECK-NEXT:     t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8
 ; CHECK-NEXT:     t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %9
-; CHECK-NEXT:   t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<72>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11>
+; CHECK-NEXT:   t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<74>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11>
 ; CHECK-NEXT:   t27: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
 ; CHECK-NEXT:   t30: i32,ch = BUFFER_LOAD_DWORD_OFFEN TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
-; CHECK-NEXT:   t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<72>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11>
+; CHECK-NEXT:   t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<74>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11>
 ; CHECK-NEXT:   t10: i64 = V_ADD_U64_PSEUDO # D:1 t50, t33
 ; CHECK-NEXT:   t24: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3>
 ; CHECK-NEXT:   t17: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t24