diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 49e9d6bd73a4c..566a5e412662b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -932,6 +932,9 @@ class CombinerHelper {
   // merge_values(_, undef) -> anyext
   bool matchMergeXAndUndef(const MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  // merge_values(_, zero) -> zext
+  bool matchMergeXAndZero(const MachineInstr &MI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 95f3d637da854..87f043979262a 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -865,6 +865,14 @@ def merge_of_x_and_undef : GICombineRule <
   [{ return Helper.matchMergeXAndUndef(*${MI}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>;
 
+/// Transform merge_x_zero -> zext.
+def merge_of_x_and_zero : GICombineRule <
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (G_CONSTANT $zero, 0),
+         (G_MERGE_VALUES $root, $x, $zero):$MI,
+  [{ return Helper.matchMergeXAndZero(*${MI}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>;
+
 def merge_combines: GICombineGroup<[
   unmerge_anyext_build_vector,
   unmerge_merge,
@@ -873,7 +881,8 @@ def merge_combines: GICombineGroup<[
   unmerge_undef,
   unmerge_dead_to_trunc,
   unmerge_zext_to_zext,
-  merge_of_x_and_undef
+  merge_of_x_and_undef,
+  merge_of_x_and_zero
 ]>;
 
 // Under certain conditions, transform:
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
index 8f4095f01be7a..797a1e84e21e3 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -56,3 +56,31 @@ bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI,
   };
   return true;
 }
+
+bool CombinerHelper::matchMergeXAndZero(const MachineInstr &MI,
+                                        BuildFnTy &MatchInfo) {
+  const GMerge *Merge = cast<GMerge>(&MI);
+
+  Register Dst = Merge->getReg(0);
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Merge->getSourceReg(0));
+
+  // No multi-use check is needed; the zero operand is a constant.
+
+  //
+  //   %bits_8_15:_(s8) = G_CONSTANT i8 0
+  //   %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)
+  //
+  // ->
+  //
+  //   %0:_(s16) = G_ZEXT %bits_0_7:(s8)
+  //
+
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    B.buildZExt(Dst, Merge->getSourceReg(0));
+  };
+  return true;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index d6a79780b8bb1..f427f8648a301 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -567,3 +567,49 @@ body: |
     $q0 = COPY %me(s128)
     $x0 = COPY %def(s64)
 ...
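+# Note: merge_of_x_and_zero matches a two-source G_MERGE_VALUES whose most
+# significant (last) source is the zero constant. A zero in the low half,
+# e.g. %m:_(s128) = G_MERGE_VALUES %zero(s64), %x(s64), is not matched
+# (illustrative registers; a sketch, not an exercised check).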
+# Check that we zext the merge
+---
+name:            test_merge_zero
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_merge_zero
+    ; CHECK: %opaque:_(s64) = COPY $x0
+    ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+    ; CHECK-NEXT: $q0 = COPY %me(s128)
+    %opaque:_(s64) = COPY $x0
+    %def:_(s64) = G_CONSTANT i64 0
+    %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+    $q0 = COPY %me(s128)
+...
+# Check that we still zext the merge, multi-use
+---
+name:            test_merge_zero_multi_use
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_merge_zero_multi_use
+    ; CHECK: %opaque:_(s64) = COPY $x0
+    ; CHECK-NEXT: %def:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+    ; CHECK-NEXT: $q0 = COPY %me(s128)
+    ; CHECK-NEXT: $x0 = COPY %def(s64)
+    %opaque:_(s64) = COPY $x0
+    %def:_(s64) = G_CONSTANT i64 0
+    %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+    $q0 = COPY %me(s128)
+    $x0 = COPY %def(s64)
+...
+# Check that we don't zext the merge with one
+---
+name:            test_merge_one
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_merge_one
+    ; CHECK: %opaque:_(s64) = COPY $x0
+    ; CHECK-NEXT: %def:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def(s64)
+    ; CHECK-NEXT: $q0 = COPY %me(s128)
+    %opaque:_(s64) = COPY $x0
+    %def:_(s64) = G_CONSTANT i64 1
+    %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+    $q0 = COPY %me(s128)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
index bdfafa89cd047..28ed88f4cf8fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s
 ; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
@@ -16,7 +16,7 @@ declare void @llvm.amdgcn.struct.ptr.buffer.store.format.v4i32(<4 x i32>, ptr ad
 define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg) {
 ; IR-LABEL: define amdgpu_cs void @atomic_add(
 ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) {
-; IR-NEXT:  .entry:
+; IR-NEXT:  [[_ENTRY:.*:]]
 ; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
 ; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
@@ -26,19 +26,18 @@ define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg) {
 ; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
 ; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
 ; IR-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0
-; IR-NEXT:    br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]]
-; IR:       9:
+; IR-NEXT:    br i1 [[TMP8]], label %[[BB9:.*]], label %[[BB11:.*]]
+; IR:       [[BB9]]:
 ; IR-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0)
-; IR-NEXT:    br label [[TMP11]]
-; IR:       11:
+; IR-NEXT:    br label %[[BB11]]
+; IR:       [[BB11]]:
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: atomic_add:
 ; GCN:       ; %bb.0: ; %.entry
 ; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_mov_b32 s6, s5
 ; GCN-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s6, v0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_and_saveexec_b64
s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_2 @@ -57,7 +56,7 @@ define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg) { define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) { ; IR-LABEL: define amdgpu_cs void @atomic_add_and_format( ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) { -; IR-NEXT: .entry: +; IR-NEXT: [[_ENTRY:.*:]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 @@ -67,12 +66,12 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) { ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 -; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] -; IR: 9: +; IR-NEXT: br i1 [[TMP8]], label %[[TMP9:.*]], label %[[BB11:.*]] +; IR: [[TMP9]]: ; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) -; IR-NEXT: br label [[TMP11]] -; IR: 11: -; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] +; IR-NEXT: br label %[[BB11]] +; IR: [[BB11]]: +; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], %[[TMP9]] ] ; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]] ; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0) @@ -81,9 +80,8 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) { ; GCN-LABEL: atomic_add_and_format: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b32 s4, s7 ; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -114,7 +112,7 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) { define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg) { ; IR-LABEL: define amdgpu_cs void @atomic_sub( ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) { -; IR-NEXT: .entry: +; IR-NEXT: [[_ENTRY:.*:]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 @@ -124,19 +122,18 @@ define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg) { ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 -; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] -; IR: 9: +; IR-NEXT: br i1 [[TMP8]], label %[[BB9:.*]], label %[[BB11:.*]] +; IR: [[BB9]]: ; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) -; IR-NEXT: br label [[TMP11]] -; IR: 11: +; IR-NEXT: br label %[[BB11]] +; IR: [[BB11]]: ; IR-NEXT: ret void ; ; GCN-LABEL: atomic_sub: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_mov_b32 s6, s5 ; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: 
s_cbranch_execz .LBB2_2 @@ -155,7 +152,7 @@ define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg) { define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) { ; IR-LABEL: define amdgpu_cs void @atomic_sub_and_format( ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) { -; IR-NEXT: .entry: +; IR-NEXT: [[_ENTRY:.*:]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 @@ -165,12 +162,12 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) { ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 -; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] -; IR: 9: +; IR-NEXT: br i1 [[TMP8]], label %[[TMP9:.*]], label %[[BB11:.*]] +; IR: [[TMP9]]: ; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 [[TMP7]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) -; IR-NEXT: br label [[TMP11]] -; IR: 11: -; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] +; IR-NEXT: br label %[[BB11]] +; IR: [[BB11]]: +; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], %[[TMP9]] ] ; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]] ; IR-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4i32(<4 x i32> [[ARG]], <4 x i32> [[ARG]], i32 [[TMP14]], i32 0, i32 0, i32 0) @@ -179,9 +176,8 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) { ; GCN-LABEL: atomic_sub_and_format: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b32 s4, s7 ; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -212,7 +208,7 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) { define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg) { ; IR-LABEL: define amdgpu_cs void @atomic_xor( ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) { -; IR-NEXT: .entry: +; IR-NEXT: [[_ENTRY:.*:]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 @@ -223,19 +219,18 @@ define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg) { ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 ; IR-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], 1 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 -; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] -; IR: 10: +; IR-NEXT: br i1 [[TMP9]], label %[[BB10:.*]], label %[[BB12:.*]] +; IR: [[BB10]]: ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) -; IR-NEXT: br label [[TMP12]] -; IR: 12: +; IR-NEXT: br label %[[BB12]] +; IR: [[BB12]]: ; IR-NEXT: ret void ; ; GCN-LABEL: atomic_xor: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_mov_b32 s6, s5 ; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB4_2 @@ -255,7 +250,7 @@ 
define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg) { define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) { ; IR-LABEL: define amdgpu_cs void @atomic_xor_and_format( ; IR-SAME: <4 x i32> inreg [[ARG:%.*]]) { -; IR-NEXT: .entry: +; IR-NEXT: [[_ENTRY:.*:]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 @@ -266,12 +261,12 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) { ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 ; IR-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], 1 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 -; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] -; IR: 10: +; IR-NEXT: br i1 [[TMP9]], label %[[TMP10:.*]], label %[[BB12:.*]] +; IR: [[TMP10]]: ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 [[TMP8]], <4 x i32> [[ARG]], i32 0, i32 0, i32 0, i32 0) -; IR-NEXT: br label [[TMP12]] -; IR: 12: -; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: br label %[[BB12]] +; IR: [[BB12]]: +; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], %[[TMP10]] ] ; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = and i32 [[TMP5]], 1 ; IR-NEXT: [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]] @@ -281,9 +276,8 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) { ; GCN-LABEL: atomic_xor_and_format: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b32 s4, s7 ; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -316,7 +310,7 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) { define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg) { ; IR-LABEL: define amdgpu_cs void @atomic_ptr_add( ; IR-SAME: ptr addrspace(8) inreg [[ARG:%.*]]) { -; IR-NEXT: .entry: +; IR-NEXT: [[_ENTRY:.*:]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 @@ -326,19 +320,18 @@ define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg) { ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 -; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] -; IR: 9: +; IR-NEXT: br i1 [[TMP8]], label %[[BB9:.*]], label %[[BB11:.*]] +; IR: [[BB9]]: ; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG]], i32 0, i32 0, i32 0, i32 0) -; IR-NEXT: br label [[TMP11]] -; IR: 11: +; IR-NEXT: br label %[[BB11]] +; IR: [[BB11]]: ; IR-NEXT: ret void ; ; GCN-LABEL: atomic_ptr_add: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_mov_b32 s6, s5 ; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB6_2 @@ -357,7 +350,7 @@ define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg) { define amdgpu_cs 
void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) { ; IR-LABEL: define amdgpu_cs void @atomic_ptr_add_and_format( ; IR-SAME: ptr addrspace(8) inreg [[ARG:%.*]]) { -; IR-NEXT: .entry: +; IR-NEXT: [[_ENTRY:.*:]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 @@ -367,12 +360,12 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) { ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 -; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] -; IR: 9: +; IR-NEXT: br i1 [[TMP8]], label %[[TMP9:.*]], label %[[BB11:.*]] +; IR: [[TMP9]]: ; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG]], i32 0, i32 0, i32 0, i32 0) -; IR-NEXT: br label [[TMP11]] -; IR: 11: -; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] +; IR-NEXT: br label %[[BB11]] +; IR: [[BB11]]: +; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], %[[TMP9]] ] ; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP5]] ; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128 @@ -383,9 +376,8 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) { ; GCN-LABEL: atomic_ptr_add_and_format: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b32 s4, s7 ; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -418,7 +410,7 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) { define amdgpu_cs void @atomic_ptr_sub(ptr addrspace(8) inreg %arg) { ; IR-LABEL: define amdgpu_cs void @atomic_ptr_sub( ; IR-SAME: ptr addrspace(8) inreg [[ARG:%.*]]) { -; IR-NEXT: .entry: +; IR-NEXT: [[_ENTRY:.*:]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 @@ -428,19 +420,18 @@ define amdgpu_cs void @atomic_ptr_sub(ptr addrspace(8) inreg %arg) { ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 -; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] -; IR: 9: +; IR-NEXT: br i1 [[TMP8]], label %[[BB9:.*]], label %[[BB11:.*]] +; IR: [[BB9]]: ; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG]], i32 0, i32 0, i32 0, i32 0) -; IR-NEXT: br label [[TMP11]] -; IR: 11: +; IR-NEXT: br label %[[BB11]] +; IR: [[BB11]]: ; IR-NEXT: ret void ; ; GCN-LABEL: atomic_ptr_sub: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_mov_b32 s6, s5 ; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB8_2 @@ -459,7 +450,7 @@ define amdgpu_cs void @atomic_ptr_sub(ptr addrspace(8) 
inreg %arg) { define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) { ; IR-LABEL: define amdgpu_cs void @atomic_ptr_sub_and_format( ; IR-SAME: ptr addrspace(8) inreg [[ARG:%.*]]) { -; IR-NEXT: .entry: +; IR-NEXT: [[_ENTRY:.*:]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 @@ -469,12 +460,12 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) { ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]]) ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 ; IR-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 0 -; IR-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP11:%.*]] -; IR: 9: +; IR-NEXT: br i1 [[TMP8]], label %[[TMP9:.*]], label %[[BB11:.*]] +; IR: [[TMP9]]: ; IR-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub.i32(i32 [[TMP7]], ptr addrspace(8) [[ARG]], i32 0, i32 0, i32 0, i32 0) -; IR-NEXT: br label [[TMP11]] -; IR: 11: -; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], [[TMP9]] ] +; IR-NEXT: br label %[[BB11]] +; IR: [[BB11]]: +; IR-NEXT: [[TMP12:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP10]], %[[TMP9]] ] ; IR-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP12]]) ; IR-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], [[TMP5]] ; IR-NEXT: [[ARG_INT:%.*]] = ptrtoint ptr addrspace(8) [[ARG]] to i128 @@ -485,9 +476,8 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) { ; GCN-LABEL: atomic_ptr_sub_and_format: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b32 s4, s7 ; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -520,7 +510,7 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) { define amdgpu_cs void @atomic_ptr_xor(ptr addrspace(8) inreg %arg) { ; IR-LABEL: define amdgpu_cs void @atomic_ptr_xor( ; IR-SAME: ptr addrspace(8) inreg [[ARG:%.*]]) { -; IR-NEXT: .entry: +; IR-NEXT: [[_ENTRY:.*:]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 @@ -531,19 +521,18 @@ define amdgpu_cs void @atomic_ptr_xor(ptr addrspace(8) inreg %arg) { ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 ; IR-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], 1 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 -; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] -; IR: 10: +; IR-NEXT: br i1 [[TMP9]], label %[[BB10:.*]], label %[[BB12:.*]] +; IR: [[BB10]]: ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.xor.i32(i32 [[TMP8]], ptr addrspace(8) [[ARG]], i32 0, i32 0, i32 0, i32 0) -; IR-NEXT: br label [[TMP12]] -; IR: 12: +; IR-NEXT: br label %[[BB12]] +; IR: [[BB12]]: ; IR-NEXT: ret void ; ; GCN-LABEL: atomic_ptr_xor: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_mov_b32 s6, s5 ; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB10_2 @@ -563,7 +552,7 @@ define amdgpu_cs void 
@atomic_ptr_xor(ptr addrspace(8) inreg %arg) { define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) { ; IR-LABEL: define amdgpu_cs void @atomic_ptr_xor_and_format( ; IR-SAME: ptr addrspace(8) inreg [[ARG:%.*]]) { -; IR-NEXT: .entry: +; IR-NEXT: [[_ENTRY:.*:]] ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32 @@ -574,12 +563,12 @@ define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) { ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 ; IR-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], 1 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0 -; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] -; IR: 10: +; IR-NEXT: br i1 [[TMP9]], label %[[TMP10:.*]], label %[[BB12:.*]] +; IR: [[TMP10]]: ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.xor.i32(i32 [[TMP8]], ptr addrspace(8) [[ARG]], i32 0, i32 0, i32 0, i32 0) -; IR-NEXT: br label [[TMP12]] -; IR: 12: -; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: br label %[[BB12]] +; IR: [[BB12]]: +; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[DOTENTRY:%.*]] ], [ [[TMP11]], %[[TMP10]] ] ; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]]) ; IR-NEXT: [[TMP15:%.*]] = and i32 [[TMP5]], 1 ; IR-NEXT: [[TMP16:%.*]] = xor i32 [[TMP14]], [[TMP15]] @@ -591,9 +580,8 @@ define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) { ; GCN-LABEL: atomic_ptr_xor_and_format: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b32 s4, s7 ; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s4, v0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir index d649a8ad58b43..17537f1d9a067 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir @@ -13,9 +13,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV1]](s32), [[C]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UV1]](s32) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CONSTANT i64 32 %2:_(s64) = G_LSHR %0, %1 @@ -34,9 +33,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV1]](s32), [[C]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UV1]](s32) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CONSTANT i32 32 %2:_(s64) = G_LSHR %0, %1 @@ -57,9 +55,8 @@ body: | ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; 
CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LSHR]](s32), [[C1]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LSHR]](s32) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CONSTANT i32 33 %2:_(s64) = G_LSHR %0, %1 @@ -100,9 +97,8 @@ body: | ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LSHR]](s32), [[C1]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LSHR]](s32) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CONSTANT i32 63 %2:_(s64) = G_LSHR %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 8294dffc09b3c..8ff2f59964ab5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1015,9 +1015,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB36_2 @@ -1039,9 +1038,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB36_2 @@ -1067,9 +1065,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB37_2 @@ -1089,9 +1086,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB37_2 @@ -1117,9 +1113,8 
@@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB38_2 @@ -1141,9 +1136,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB38_2 @@ -1169,9 +1163,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB39_2 @@ -1191,9 +1184,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB39_2 @@ -1298,9 +1290,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB43_2 @@ -1320,9 +1311,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB43_2 @@ -1549,9 +1539,8 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB51_2 @@ -1570,9 +1559,8 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB51_2 @@ -1596,9 +1584,8 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB52_2 @@ -1617,9 +1604,8 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB52_2 @@ -1643,9 +1629,8 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: s_mov_b32 s2, s1 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX90A-NEXT: s_cbranch_execz .LBB53_2 @@ -1664,9 +1649,8 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: s_mov_b32 s2, s1 ; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX940-NEXT: s_cbranch_execz .LBB53_2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll index 6064b17f5f8f3..6459110dd8bbb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -75,42 +75,39 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 - ; GFX908-NEXT: 
[[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX908-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec - ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 - ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX908-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY7]], [[COPY8]], implicit $exec + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY9]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX908-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec - ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY10]], [[DEF]], implicit $exec + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec - ; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX908-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX908-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec ; GFX908-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec - ; GFX908-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX908-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX908-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec ; GFX908-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec - ; GFX908-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX908-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY 
[[S_MOV_B32_1]] + ; GFX908-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec ; GFX908-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX908-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX908-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX908-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec ; GFX908-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec - ; GFX908-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX908-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX908-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX908-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec - ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]] - ; GFX908-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] - ; GFX908-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec - ; GFX908-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec + ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] + ; GFX908-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] + ; GFX908-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY17]], implicit $exec + ; GFX908-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY18]], implicit $exec ; GFX908-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX908-NEXT: S_BRANCH %bb.3 ; GFX908-NEXT: {{ $}} @@ -150,42 +147,39 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 ; GFX90A-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 - ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX90A-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec - ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; 
GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 - ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY7]], [[COPY8]], implicit $exec + ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY9]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec - ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY10]], [[DEF]], implicit $exec + ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]] - ; GFX90A-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] - ; GFX90A-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec - ; GFX90A-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec + ; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] + ; GFX90A-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY17]], implicit $exec + ; GFX90A-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY18]], implicit $exec ; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} @@ -225,42 +219,39 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 ; GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 - ; GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec - ; GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 - ; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940-NEXT: 
[[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY7]], [[COPY8]], implicit $exec + ; GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY9]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX940-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec - ; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY10]], [[DEF]], implicit $exec + ; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec ; GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec ; GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec ; GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec ; GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit 
$mode, implicit $exec
- ; GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
 ; GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
- ; GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63
- ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]]
- ; GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]]
- ; GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec
- ; GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec
+ ; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]]
+ ; GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]]
+ ; GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY17]], implicit $exec
+ ; GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY18]], implicit $exec
 ; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX940-NEXT: S_BRANCH %bb.3
 ; GFX940-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index 07c97107d1b71..e935245e30f12 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -67,44 +67,41 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
 ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0
 ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1
 ; GFX90A-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
- ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
- ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; GFX90A-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec
- ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
- ; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
- ; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
- ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX90A-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY7]], [[COPY8]], implicit $exec
+ ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY9]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
+ ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
 ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
- ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF1]], implicit $exec
- ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
+ ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY10]], [[DEF1]], implicit $exec
+ ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
 ; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX90A-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
 ; GFX90A-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX90A-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
 ; GFX90A-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX90A-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
 ; GFX90A-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX90A-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX90A-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec
 ; GFX90A-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX90A-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX90A-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
 ; GFX90A-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX90A-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY18]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec
- ; GFX90A-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63
- ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]]
- ; GFX90A-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]]
- ; GFX90A-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY19]], implicit $exec
- ; GFX90A-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY20]], implicit $exec
+ ; GFX90A-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX90A-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec
+ ; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]]
+ ; GFX90A-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]]
+ ; GFX90A-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec
+ ; GFX90A-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec
 ; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX90A-NEXT: S_BRANCH %bb.3
 ; GFX90A-NEXT: {{ $}}
@@ -129,10 +126,10 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
 ; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
 ; GFX90A-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
+ ; GFX90A-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY20]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
 ; GFX90A-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY21]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_6]], 0, [[COPY22]], [[V_CMP_EQ_U32_e64_]], implicit $exec
+ ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_6]], 0, [[COPY21]], [[V_CMP_EQ_U32_e64_]], implicit $exec
 ; GFX90A-NEXT: S_BRANCH %bb.4
 ; GFX90A-NEXT: {{ $}}
 ; GFX90A-NEXT: bb.6 (%ir-block.41):
@@ -161,44 +158,41 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
 ; GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0
 ; GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1
 ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
- ; GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
- ; GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec
- ; GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
- ; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
- ; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
- ; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY7]], [[COPY8]], implicit $exec
+ ; GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY9]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
+ ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
+ ; GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
 ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
- ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF1]], implicit $exec
- ; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
+ ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY10]], [[DEF1]], implicit $exec
+ ; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
 ; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
- ; GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
+ ; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
 ; GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
- ; GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
+ ; GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
 ; GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
- ; GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
+ ; GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
 ; GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
- ; GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec
+ ; GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec
 ; GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
- ; GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
+ ; GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
 ; GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
- ; GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY18]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec
- ; GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63
- ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]]
- ; GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]]
- ; GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY19]], implicit $exec
- ; GFX940-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY20]], implicit $exec
+ ; GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GFX940-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec
+ ; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_2]]
+ ; GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]]
+ ; GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec
+ ; GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec
 ; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX940-NEXT: S_BRANCH %bb.3
 ; GFX940-NEXT: {{ $}}
@@ -223,10 +217,10 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
 ; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
 ; GFX940-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
+ ; GFX940-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY20]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
 ; GFX940-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY21]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
- ; GFX940-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_6]], 0, [[COPY22]], [[V_CMP_EQ_U32_e64_]], implicit $exec
+ ; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_6]], 0, [[COPY21]], [[V_CMP_EQ_U32_e64_]], implicit $exec
 ; GFX940-NEXT: S_BRANCH %bb.4
 ; GFX940-NEXT: {{ $}}
 ; GFX940-NEXT: bb.6 (%ir-block.41):
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 0b5706aa45b69..613c73f7b9368 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -2505,20 +2505,18 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3
 ; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
 ; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0
-; CGP-NEXT: v_mov_b32_e32 v0, v2
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2
 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
-; CGP-NEXT: v_mov_b32_e32 v0, v1
-; CGP-NEXT: v_mul_lo_u32 v1, v0, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v5, v1
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3
+; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v1
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v3
 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v1
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
+; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
 ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT: s_setpc_b64 s[30:31]
 %num.mask = and i64 %num, 16777215
@@ -2785,52 +2783,49 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-LABEL: v_sdiv_v2i64_24bit:
 ; CGP: ; %bb.0:
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v4
-; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5
-; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6
-; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v5
+; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4
+; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
+; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
 ; CGP-NEXT: v_rcp_f32_e32 v1, v1
-; CGP-NEXT: v_rcp_f32_e32 v7, v3
-; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v0
 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1
-; CGP-NEXT: v_mul_lo_u32 v4, v4, v1
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, 0
-; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v0
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1
+; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4
+; CGP-NEXT: v_mul_lo_u32 v6, v6, v5
+; CGP-NEXT: v_rcp_f32_e32 v7, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v7, v0
-; CGP-NEXT: v_mov_b32_e32 v0, v4
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v0, 0
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v6
-; CGP-NEXT: v_mul_lo_u32 v4, v1, v5
-; CGP-NEXT: v_mul_lo_u32 v0, v0, v7
-; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v1
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5
-; CGP-NEXT: v_cndmask_b32_e32 v4, v1, v8, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0
-; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v5
-; CGP-NEXT: v_mov_b32_e32 v0, v1
-; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v7, v0
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
-; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v4
-; CGP-NEXT: v_mov_b32_e32 v7, v1
-; CGP-NEXT: v_mul_lo_u32 v8, v7, v6
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v7
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
-; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v3
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
-; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
+; CGP-NEXT: v_mul_lo_u32 v5, v1, v3
+; CGP-NEXT: v_mul_lo_u32 v0, v0, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v1
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
+; CGP-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0
+; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v2
+; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v3
+; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v6, v1
+; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v1, 0
+; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v7
+; CGP-NEXT: v_mul_lo_u32 v5, v2, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
+; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc
 ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v4
+; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
 ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2
 ; CGP-NEXT: s_setpc_b64 s[30:31]
 %num.mask = and <2 x i64> %num,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 3ed864d463ee9..d5e22df59ccb3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -3005,11 +3005,9 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3
 ; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
 ; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0
-; CGP-NEXT: v_mov_b32_e32 v0, v2
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2
 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
-; CGP-NEXT: v_mov_b32_e32 v0, v1
-; CGP-NEXT: v_mul_lo_u32 v0, v0, v3
+; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0
 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3
 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
@@ -3282,45 +3280,43 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-LABEL: v_srem_v2i64_24bit:
 ; CGP: ; %bb.0:
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v4
-; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5
-; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6
-; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v5
+; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4
+; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
+; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
 ; CGP-NEXT: v_rcp_f32_e32 v1, v1
-; CGP-NEXT: v_rcp_f32_e32 v7, v3
+; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v0
 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1
-; CGP-NEXT: v_mul_lo_u32 v4, v4, v1
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, 0
-; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v0
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1
+; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4
+; CGP-NEXT: v_mul_lo_u32 v6, v6, v5
+; CGP-NEXT: v_rcp_f32_e32 v7, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v7, v0
-; CGP-NEXT: v_mov_b32_e32 v0, v4
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v0, 0
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v6
-; CGP-NEXT: v_mul_lo_u32 v0, v0, v7
-; CGP-NEXT: v_mul_lo_u32 v4, v1, v5
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_mov_b32_e32 v0, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v7, v0
+; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
+; CGP-NEXT: v_mul_lo_u32 v0, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v5, v1, v3
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v1
 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v3, v5
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5
-; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; CGP-NEXT: v_mul_lo_u32 v4, v1, v6
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v1, v4
+; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v0, v3
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
 ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2
 ; CGP-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 147ddc4d4b75b..b2f178c6c1041 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -1760,8 +1760,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3
-; GFX9-G-O0-NEXT: s_mov_b32 s4, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14
@@ -1870,8 +1869,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT: s_mov_b32 s9, 31
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9
 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1
-; GFX9-G-O0-NEXT: s_mov_b32 s9, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5
 ; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v2, v3
@@ -1884,8 +1882,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT: s_mov_b32 s9, 31
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9
 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1
-; GFX9-G-O0-NEXT: s_mov_b32 s9, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v14
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v15
 ; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v2, v3
@@ -1903,8 +1900,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 31
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v22, v2, v3
-; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v0
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1
 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10)
@@ -3859,8 +3855,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3
-; GFX9-G-O0-NEXT: s_mov_b32 s4, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14
@@ -3969,8 +3964,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT: s_mov_b32 s9, 31
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9
 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1
-; GFX9-G-O0-NEXT: s_mov_b32 s9, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5
 ; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v2, v3
@@ -3983,8 +3977,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT: s_mov_b32 s9, 31
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9
 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1
-; GFX9-G-O0-NEXT: s_mov_b32 s9, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v20
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v21
 ; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v2, v3
@@ -4002,8 +3995,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT: s_mov_b32 s8, 31
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v14, v2, v3
-; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1
 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10)
@@ -4539,8 +4531,7 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
 ; GFX9-G-O0-NEXT: s_mov_b32 s5, 1
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5
 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
-; GFX9-G-O0-NEXT: s_mov_b32 s5, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[5:6]
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
@@ -4621,8 +4612,7 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
 ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
-; GFX9-G-O0-NEXT: s_mov_b32 s4, 0
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
 ; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[4:5]