[AMDGPU] Fix codegen to emit COPY instead of S_MOV_B64 for aperture regs #158754
Conversation
This stack of pull requests is managed by Graphite.
@llvm/pr-subscribers-debuginfo
@llvm/pr-subscribers-llvm-globalisel

Author: Stanislav Mekhanoshin (rampitec)

Changes

Patch is 372.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/158754.diff

22 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index d8c4cbbc4fa33..c690b2b7129b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2293,16 +2293,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
!ST.hasGloballyAddressableScratch()) &&
"Cannot use src_private_base with globally addressable scratch!");
- // FIXME: It would be more natural to emit a COPY here, but then copy
- // coalescing would kick in and it would think it's okay to use the "HI"
- // subregister (instead of extracting the HI 32 bits) which is an artificial
- // (unusable) register.
- // Register TableGen definitions would need an overhaul to get rid of the
- // artificial "HI" aperture registers and prevent this kind of issue from
- // happening.
Register Dst = MRI.createGenericVirtualRegister(S64);
MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
- B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
+ B.buildCopy({Dst}, {Register(ApertureRegNo)});
return B.buildUnmerge(S32, Dst).getReg(1);
}
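For quick reference, the new GlobalISel emission condenses to the following sketch (all names as in the hunk above). The high 32 bits are still extracted with G_UNMERGE_VALUES rather than through the artificial "HI" aperture subregister that the removed FIXME warned about; only the raw S_MOV_B64 is replaced by a plain COPY:

    Register Dst = MRI.createGenericVirtualRegister(S64);
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildCopy({Dst}, {Register(ApertureRegNo)});  // ordinary COPY of the aperture reg
    return B.buildUnmerge(S32, Dst).getReg(1);      // take the high half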
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9acc4b6de3501..40d313fc244df 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8165,25 +8165,16 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// it returns a wrong value (all zeroes?). The real value is in the upper 32
// bits.
//
- // To work around the issue, directly emit a 64 bit mov from this register
+ // To work around the issue, emit a 64 bit copy from this register
// then extract the high bits. Note that this shouldn't even result in a
// shift being emitted and simply become a pair of registers (e.g.):
// s_mov_b64 s[6:7], src_shared_base
// v_mov_b32_e32 v1, s7
- //
- // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
- // coalescing would kick in and it would think it's okay to use the "HI"
- // subregister directly (instead of extracting the HI 32 bits) which is an
- // artificial (unusable) register.
- // Register TableGen definitions would need an overhaul to get rid of the
- // artificial "HI" aperture registers and prevent this kind of issue from
- // happening.
- SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
- DAG.getRegister(ApertureRegNo, MVT::i64));
- return DAG.getNode(
- ISD::TRUNCATE, DL, MVT::i32,
- DAG.getNode(ISD::SRL, DL, MVT::i64,
- {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
+ SDValue Copy =
+ DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::i64);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
+ DAG.getNode(ISD::SRL, DL, MVT::i64,
+ {Copy, DAG.getConstant(32, DL, MVT::i64)}));
}
// For code object version 5, private_base and shared_base are passed through
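The SelectionDAG path makes the matching change: the hand-built S_MOV_B64 machine node becomes a CopyFromReg chained to the entry node, with the high half extracted by an SRL/TRUNCATE pair. Condensed from the hunk above:

    SDValue Copy =
        DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::i64);
    SDValue Hi = DAG.getNode(ISD::SRL, DL, MVT::i64, Copy,
                             DAG.getConstant(32, DL, MVT::i64));
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Hi);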
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
index b520ce1826ec9..3d224f2f6bf05 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
@@ -9,12 +9,11 @@
define amdgpu_ps void @amdgpu_ps() {
; MESA-LABEL: amdgpu_ps:
; MESA: ; %bb.0:
-; MESA-NEXT: s_add_u32 flat_scratch_lo, s2, s4
-; MESA-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; MESA-NEXT: s_mov_b64 s[0:1], src_private_base
; MESA-NEXT: s_mov_b32 s0, 0
-; MESA-NEXT: s_mov_b64 s[2:3], src_private_base
-; MESA-NEXT: s_mov_b32 s1, s3
+; MESA-NEXT: s_add_u32 flat_scratch_lo, s2, s4
; MESA-NEXT: v_mov_b32_e32 v0, s0
+; MESA-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; MESA-NEXT: v_mov_b32_e32 v2, 0
; MESA-NEXT: v_mov_b32_e32 v1, s1
; MESA-NEXT: flat_store_dword v[0:1], v2
@@ -30,11 +29,10 @@ define amdgpu_ps void @amdgpu_ps() {
; PAL-NEXT: s_waitcnt lgkmcnt(0)
; PAL-NEXT: s_and_b32 s3, s3, 0xffff
; PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0
-; PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; PAL-NEXT: s_mov_b64 s[0:1], src_private_base
; PAL-NEXT: s_mov_b32 s0, 0
-; PAL-NEXT: s_mov_b64 s[2:3], src_private_base
-; PAL-NEXT: s_mov_b32 s1, s3
; PAL-NEXT: v_mov_b32_e32 v0, s0
+; PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; PAL-NEXT: v_mov_b32_e32 v1, s1
; PAL-NEXT: flat_store_dword v[0:1], v2
; PAL-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 86766e2904619..9539ec465e02f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -65,52 +65,52 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
;
; GFX9V4-LABEL: addrspacecast:
; GFX9V4: ; %bb.0:
-; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base
-; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX9V4-NEXT: s_mov_b64 s[2:3], src_shared_base
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9V4-NEXT: s_mov_b32 s2, s0
-; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1
+; GFX9V4-NEXT: s_mov_b32 s0, s4
+; GFX9V4-NEXT: s_cmp_lg_u32 s4, -1
+; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX9V4-NEXT: s_mov_b32 s2, s5
+; GFX9V4-NEXT: s_cmp_lg_u32 s5, -1
+; GFX9V4-NEXT: v_mov_b32_e32 v0, s0
; GFX9V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9V4-NEXT: s_mov_b32 s4, s1
-; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1
-; GFX9V4-NEXT: v_mov_b32_e32 v0, s2
-; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
; GFX9V4-NEXT: v_mov_b32_e32 v2, 1
-; GFX9V4-NEXT: v_mov_b32_e32 v1, s3
+; GFX9V4-NEXT: v_mov_b32_e32 v1, s1
; GFX9V4-NEXT: flat_store_dword v[0:1], v2
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
-; GFX9V4-NEXT: v_mov_b32_e32 v0, s0
+; GFX9V4-NEXT: v_mov_b32_e32 v0, s2
; GFX9V4-NEXT: v_mov_b32_e32 v2, 2
-; GFX9V4-NEXT: v_mov_b32_e32 v1, s1
+; GFX9V4-NEXT: v_mov_b32_e32 v1, s3
; GFX9V4-NEXT: flat_store_dword v[0:1], v2
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: s_endpgm
;
; GFX9V5-LABEL: addrspacecast:
; GFX9V5: ; %bb.0:
-; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V5-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base
-; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX9V5-NEXT: s_mov_b64 s[2:3], src_shared_base
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9V5-NEXT: s_mov_b32 s2, s0
-; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1
+; GFX9V5-NEXT: s_mov_b32 s0, s4
+; GFX9V5-NEXT: s_cmp_lg_u32 s4, -1
+; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX9V5-NEXT: s_mov_b32 s2, s5
+; GFX9V5-NEXT: s_cmp_lg_u32 s5, -1
+; GFX9V5-NEXT: v_mov_b32_e32 v0, s0
; GFX9V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9V5-NEXT: s_mov_b32 s4, s1
-; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1
-; GFX9V5-NEXT: v_mov_b32_e32 v0, s2
-; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
; GFX9V5-NEXT: v_mov_b32_e32 v2, 1
-; GFX9V5-NEXT: v_mov_b32_e32 v1, s3
+; GFX9V5-NEXT: v_mov_b32_e32 v1, s1
; GFX9V5-NEXT: flat_store_dword v[0:1], v2
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
-; GFX9V5-NEXT: v_mov_b32_e32 v0, s0
+; GFX9V5-NEXT: v_mov_b32_e32 v0, s2
; GFX9V5-NEXT: v_mov_b32_e32 v2, 2
-; GFX9V5-NEXT: v_mov_b32_e32 v1, s1
+; GFX9V5-NEXT: v_mov_b32_e32 v1, s3
; GFX9V5-NEXT: flat_store_dword v[0:1], v2
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: s_endpgm
@@ -150,10 +150,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
;
; GFX9V4-LABEL: llvm_amdgcn_is_shared:
; GFX9V4: ; %bb.0:
-; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V4-NEXT: s_mov_b64 s[2:3], src_shared_base
+; GFX9V4-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX9V4-NEXT: s_mov_b64 s[0:1], src_shared_base
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3
+; GFX9V4-NEXT: s_cmp_eq_u32 s3, s1
; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0
; GFX9V4-NEXT: v_mov_b32_e32 v0, s0
; GFX9V4-NEXT: global_store_dword v[0:1], v0, off
@@ -162,10 +162,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
;
; GFX9V5-LABEL: llvm_amdgcn_is_shared:
; GFX9V5: ; %bb.0:
-; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V5-NEXT: s_mov_b64 s[2:3], src_shared_base
+; GFX9V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX9V5-NEXT: s_mov_b64 s[0:1], src_shared_base
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3
+; GFX9V5-NEXT: s_cmp_eq_u32 s3, s1
; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0
; GFX9V5-NEXT: v_mov_b32_e32 v0, s0
; GFX9V5-NEXT: global_store_dword v[0:1], v0, off
@@ -206,10 +206,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
;
; GFX9V4-LABEL: llvm_amdgcn_is_private:
; GFX9V4: ; %bb.0:
-; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX9V4-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX9V4-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9V4-NEXT: s_cmp_eq_u32 s1, s3
+; GFX9V4-NEXT: s_cmp_eq_u32 s3, s1
; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0
; GFX9V4-NEXT: v_mov_b32_e32 v0, s0
; GFX9V4-NEXT: global_store_dword v[0:1], v0, off
@@ -218,10 +218,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
;
; GFX9V5-LABEL: llvm_amdgcn_is_private:
; GFX9V5: ; %bb.0:
-; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX9V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX9V5-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9V5-NEXT: s_cmp_eq_u32 s1, s3
+; GFX9V5-NEXT: s_cmp_eq_u32 s3, s1
; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0
; GFX9V5-NEXT: v_mov_b32_e32 v0, s0
; GFX9V5-NEXT: global_store_dword v[0:1], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
index d69a3e1a15bbd..4471980c1ba1c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
@@ -158,8 +158,8 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5)
- ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s64) = COPY $src_private_base
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32)
; GFX9-NEXT: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
@@ -227,8 +227,8 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3)
- ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s64) = COPY $src_shared_base
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32)
; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
@@ -380,16 +380,16 @@ body: |
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1
; GFX9-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>)
; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
- ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base
- ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_64(s64) = COPY $src_shared_base
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV3]](s32)
; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]]
; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]]
; GFX9-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
- ; GFX9-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base
- ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_1]](s64)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_64(s64) = COPY $src_shared_base
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64)
; GFX9-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[UV5]](s32)
; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]]
; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C1]]
@@ -517,8 +517,8 @@ body: |
; GFX9-LABEL: name: test_addrspacecast_p5_fi_to_p0
; GFX9: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5)
- ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64(s64) = COPY $src_private_base
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p0)
%0:_(p5) = G_FRAME_INDEX %stack.0
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
index 58f3ffb0492e0..bc341f2baa804 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
@@ -361,8 +361,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__av(ptr %ptr) #0 {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
-; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[2:3]
@@ -417,8 +417,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__v(ptr %ptr) #0 {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
-; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[2:3]
@@ -473,8 +473,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_av__a(ptr %ptr) #0 {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
-; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[2:3]
@@ -538,13 +538,13 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__a(ptr %ptr) #0 {
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
@@ -603,13 +603,13 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__v(ptr %ptr) #0 {
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
@@ -659,12 +659,12 @@ define void @flat_atomic_cmpxchg_i64_ret_v_a__v(ptr %ptr) #0 {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; CHECK-NEXT: ;;#ASMSTART
@@ -717,12 +717,12 @@ define void @flat_atomic_cmpxchg_i64_ret_a_v__v(ptr %ptr) #0 {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:1]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
-; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
; CHECK-NEXT: ;;#ASMSTART
@@ -775,8 +775,8 @@ define void @flat_atomic_cmpxchg_i64_ret_v_v__a(ptr %ptr) #0 {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
-; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[2:3]
@@ -836,8 +836,8 @@ define void @flat_atomic_cmpxchg_i64_ret_av_v__av(ptr %ptr) #0 {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
-; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
; ...
[truncated]
Force-pushed from d3326e1 to 729f294
BTW, now it really does not result in an actual shift, as was written in the comment.
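(This refers to the code comment in SIISelLowering.cpp above: after this change the SRL by 32 is expected to fold into a direct use of the high register of the 64-bit pair, so only the pattern quoted there remains, e.g.

    s_mov_b64 s[6:7], src_shared_base
    v_mov_b32_e32 v1, s7

as in the updated tests.)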
Force-pushed from 729f294 to 49c4a85
Force-pushed from f09aff5 to 1d04421
LGTM