diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 022af501289af..10956861650ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -19,6 +19,7 @@
 
 #include "AMDGPU.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include <optional>
 
 namespace llvm {
@@ -174,24 +175,23 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   bool isAlwaysUniform(const Value *V) const;
 
   bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
-    if (ToAS == AMDGPUAS::FLAT_ADDRESS) {
-      switch (FromAS) {
-      case AMDGPUAS::GLOBAL_ADDRESS:
-      case AMDGPUAS::CONSTANT_ADDRESS:
-      case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
-      case AMDGPUAS::LOCAL_ADDRESS:
-      case AMDGPUAS::PRIVATE_ADDRESS:
-        return true;
-      default:
-        break;
-      }
+    // Address space casts must cast between different address spaces.
+    if (FromAS == ToAS)
       return false;
-    }
 
-    if ((FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
-         ToAS == AMDGPUAS::CONSTANT_ADDRESS) ||
-        (FromAS == AMDGPUAS::CONSTANT_ADDRESS &&
-         ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT))
-      return true;
+
+    if (FromAS == AMDGPUAS::FLAT_ADDRESS)
+      return AMDGPU::isExtendedGlobalAddrSpace(ToAS) ||
+             ToAS == AMDGPUAS::LOCAL_ADDRESS ||
+             ToAS == AMDGPUAS::PRIVATE_ADDRESS;
+
+    if (AMDGPU::isExtendedGlobalAddrSpace(FromAS))
+      return AMDGPU::isFlatGlobalAddrSpace(ToAS) ||
+             ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
+
+    if (FromAS == AMDGPUAS::LOCAL_ADDRESS ||
+        FromAS == AMDGPUAS::PRIVATE_ADDRESS)
+      return ToAS == AMDGPUAS::FLAT_ADDRESS;
+
     return false;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 9e2e37a886d1f..3f07b3d94692c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -1336,8 +1336,8 @@ define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrs
 ; MAX1024-NEXT:    ret void
 ;
 ; ALL-LABEL: @memmove_flat_align1_global_align1(
-; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[SRC:%.*]] to ptr
-; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
+; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(1)
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[TMP1]]
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
 ; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
@@ -1404,8 +1404,8 @@ define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addr
 ; MAX1024-NEXT:    ret void
 ;
 ; ALL-LABEL: @memmove_flat_align1_private_align1(
-; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[SRC:%.*]] to ptr
-; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
+; ALL-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(5)
+; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[TMP1]]
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
 ; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
@@ -1514,7 +1514,59 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1)
 
 define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %dst, ptr
addrspace(999) %src, i64 %size) { ; OPT-LABEL: @memmove_global_align1_p999_align1( -; OPT-NEXT: call void @llvm.memmove.p1.p999.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(999) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false) +; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[SIZE:%.*]], 4 +; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE]], 15 +; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]] +; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0 +; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP1]], 0 +; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[DST:%.*]] to ptr addrspace(999) +; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(999) [[SRC:%.*]], [[TMP4]] +; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]] +; OPT: memmove_copy_backwards: +; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]] +; OPT: memmove_bwd_residual_loop: +; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ] +; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1 +; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]] +; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(999) [[TMP6]], align 1 +; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_RESIDUAL_INDEX]] +; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(1) [[TMP7]], align 1 +; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]] +; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]] +; OPT: memmove_bwd_middle: +; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]] +; OPT: memmove_bwd_main_loop: +; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ] +; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 1 +; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[SRC]], i64 [[BWD_MAIN_INDEX]] +; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(999) [[TMP10]], align 1 +; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_MAIN_INDEX]] +; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP11]], align 1 +; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0 +; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]] +; OPT: memmove_copy_forward: +; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]] +; OPT: memmove_fwd_main_loop: +; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ] +; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[SRC]], i64 [[FWD_MAIN_INDEX]] +; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(999) [[TMP13]], align 1 +; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_MAIN_INDEX]] +; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(1) [[TMP14]], align 1 +; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 1 +; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP1]] +; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]] +; OPT: memmove_fwd_middle: +; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label 
[[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]] +; OPT: memmove_fwd_residual_loop: +; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ] +; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]] +; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(999) [[TMP17]], align 1 +; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_RESIDUAL_INDEX]] +; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(1) [[TMP18]], align 1 +; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1 +; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]] +; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]] +; OPT: memmove_done: ; OPT-NEXT: ret void ; call void @llvm.memmove.p1.p999.i64(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size, i1 false) @@ -1523,7 +1575,59 @@ define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %d define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size) { ; OPT-LABEL: @memmove_p999_align1_p1_align1( -; OPT-NEXT: call void @llvm.memmove.p999.p1.i64(ptr addrspace(999) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false) +; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[SIZE:%.*]], 4 +; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE]], 15 +; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]] +; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0 +; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP1]], 0 +; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(999) [[DST:%.*]] to ptr addrspace(1) +; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[TMP4]] +; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]] +; OPT: memmove_copy_backwards: +; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]] +; OPT: memmove_bwd_residual_loop: +; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ] +; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1 +; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]] +; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 1 +; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_RESIDUAL_INDEX]] +; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(999) [[TMP7]], align 1 +; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]] +; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]] +; OPT: memmove_bwd_middle: +; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]] +; OPT: memmove_bwd_main_loop: +; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ] +; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 1 +; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_MAIN_INDEX]] +; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP10]], align 1 +; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]] +; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr 
addrspace(999) [[TMP11]], align 1 +; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0 +; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]] +; OPT: memmove_copy_forward: +; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]] +; OPT: memmove_fwd_main_loop: +; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ] +; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_MAIN_INDEX]] +; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP13]], align 1 +; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]] +; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(999) [[TMP14]], align 1 +; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 1 +; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP1]] +; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]] +; OPT: memmove_fwd_middle: +; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]] +; OPT: memmove_fwd_residual_loop: +; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ] +; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]] +; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(1) [[TMP17]], align 1 +; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_RESIDUAL_INDEX]] +; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(999) [[TMP18]], align 1 +; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1 +; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]] +; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]] +; OPT: memmove_done: ; OPT-NEXT: ret void ; call void @llvm.memmove.p999.p1.i64(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size, i1 false) @@ -1532,7 +1636,59 @@ define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst define amdgpu_kernel void @memmove_p999_align1_p998_align1(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size) { ; OPT-LABEL: @memmove_p999_align1_p998_align1( -; OPT-NEXT: call void @llvm.memmove.p999.p998.i64(ptr addrspace(999) [[DST:%.*]], ptr addrspace(998) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false) +; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[SIZE:%.*]], 4 +; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE]], 15 +; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]] +; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0 +; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP1]], 0 +; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(999) [[DST:%.*]] to ptr addrspace(998) +; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(998) [[SRC:%.*]], [[TMP4]] +; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]] +; OPT: memmove_copy_backwards: +; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]] +; OPT: memmove_bwd_residual_loop: +; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ] +; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1 +; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], 
i64 [[BWD_RESIDUAL_INDEX]] +; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(998) [[TMP6]], align 1 +; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_RESIDUAL_INDEX]] +; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(999) [[TMP7]], align 1 +; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]] +; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]] +; OPT: memmove_bwd_middle: +; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]] +; OPT: memmove_bwd_main_loop: +; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ] +; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 1 +; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(998) [[SRC]], i64 [[BWD_MAIN_INDEX]] +; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(998) [[TMP10]], align 1 +; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]] +; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(999) [[TMP11]], align 1 +; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0 +; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]] +; OPT: memmove_copy_forward: +; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]] +; OPT: memmove_fwd_main_loop: +; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ] +; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(998) [[SRC]], i64 [[FWD_MAIN_INDEX]] +; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(998) [[TMP13]], align 1 +; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]] +; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(999) [[TMP14]], align 1 +; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 1 +; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP1]] +; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]] +; OPT: memmove_fwd_middle: +; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]] +; OPT: memmove_fwd_residual_loop: +; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ] +; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]] +; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(998) [[TMP17]], align 1 +; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_RESIDUAL_INDEX]] +; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(999) [[TMP18]], align 1 +; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1 +; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]] +; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]] +; OPT: memmove_done: ; OPT-NEXT: ret void ; call void @llvm.memmove.p999.p998.i64(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size, i1 false) @@ -1726,8 +1882,8 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds ; MAX1024-NEXT: ret void ; ; ALL-LABEL: @memmove_flat_align1_local_align1( -; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr -; ALL-NEXT: 
[[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]] +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(3) +; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[TMP1]] ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]] ; ALL: memmove_bwd_loop: ; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ] @@ -1761,8 +1917,8 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]] ; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0 ; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0 -; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr -; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP4]], [[DST:%.*]] +; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(3) +; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[TMP4]] ; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]] ; OPT: memmove_copy_backwards: ; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]] diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll new file mode 100644 index 0000000000000..dfa489d317403 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -0,0 +1,2329 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s + +; Check code generation for memmoves with statically unknown size and all +; combinations of the following address spaces: +; destination address space: 0, 1, 3, 5 +; source address space: 0, 1, 3, 4, 5 + +define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) { +; CHECK-LABEL: memmove_p0_p0: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v9, 0 +; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 +; CHECK-NEXT: v_lshrrev_b64 v[6:7], 4, v[4:5] +; CHECK-NEXT: s_mov_b32 s6, exec_lo +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] +; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 +; CHECK-NEXT: s_cbranch_execnz .LBB0_3 +; CHECK-NEXT: ; %bb.1: ; %Flow37 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 +; CHECK-NEXT: s_cbranch_execnz .LBB0_10 +; CHECK-NEXT: .LBB0_2: ; %Flow38 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_3: ; %memmove_copy_forward +; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo +; CHECK-NEXT: s_cbranch_execz .LBB0_6 +; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader +; CHECK-NEXT: v_mov_b32_e32 v11, v3 +; CHECK-NEXT: v_mov_b32_e32 v13, v1 +; CHECK-NEXT: v_mov_b32_e32 v10, v2 +; CHECK-NEXT: v_mov_b32_e32 v12, v0 +; CHECK-NEXT: s_mov_b32 s9, 0 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB0_5: ; %memmove_fwd_main_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: flat_load_dwordx4 v[14:17], v[10:11] +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5 +; CHECK-NEXT: v_add_co_u32 v10, s5, v10, 16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s5, 0, v11, s5 +; 
CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] +; CHECK-NEXT: s_or_b32 s9, s5, s9 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[14:17] +; CHECK-NEXT: v_add_co_u32 v12, s6, v12, 16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s6, 0, v13, s6 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; CHECK-NEXT: s_cbranch_execnz .LBB0_5 +; CHECK-NEXT: .LBB0_6: ; %Flow32 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: s_and_saveexec_b32 s8, s4 +; CHECK-NEXT: s_cbranch_execz .LBB0_9 +; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader +; CHECK-NEXT: v_sub_co_u32 v4, s5, v4, v8 +; CHECK-NEXT: v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5 +; CHECK-NEXT: s_mov_b32 s9, 0 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v5, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v5, s5 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB0_8: ; %memmove_fwd_residual_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: flat_load_ubyte v4, v[2:3] +; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5 +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: s_or_b32 s9, s5, s9 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 +; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; CHECK-NEXT: s_cbranch_execnz .LBB0_8 +; CHECK-NEXT: .LBB0_9: ; %Flow30 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: .LBB0_10: ; %memmove_copy_backwards +; CHECK-NEXT: s_and_saveexec_b32 s7, s4 +; CHECK-NEXT: s_cbranch_execz .LBB0_13 +; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader +; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4 +; CHECK-NEXT: s_mov_b32 s8, 0 +; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4 +; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB0_12: ; %memmove_bwd_residual_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: flat_load_ubyte v12, v[10:11] +; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4 +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: s_or_b32 s8, s4, s8 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[4:5], v12 +; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: s_cbranch_execnz .LBB0_12 +; CHECK-NEXT: .LBB0_13: ; %Flow36 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo +; CHECK-NEXT: s_cbranch_execz .LBB0_16 +; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader +; CHECK-NEXT: v_lshlrev_b64 v[4:5], 4, v[6:7] +; CHECK-NEXT: 
s_mov_b32 s7, 0 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v4, -16 +; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB0_15: ; %memmove_bwd_main_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 +; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; CHECK-NEXT: v_add_co_u32 v0, s4, v0, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s4, -1, v1, s4 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_cbranch_execnz .LBB0_15 +; CHECK-NEXT: .LBB0_16: ; %Flow34 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) { +; CHECK-LABEL: memmove_p0_p1: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v9, 0 +; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 +; CHECK-NEXT: v_lshrrev_b64 v[6:7], 4, v[4:5] +; CHECK-NEXT: s_mov_b32 s6, exec_lo +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] +; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 +; CHECK-NEXT: s_cbranch_execnz .LBB1_3 +; CHECK-NEXT: ; %bb.1: ; %Flow41 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 +; CHECK-NEXT: s_cbranch_execnz .LBB1_10 +; CHECK-NEXT: .LBB1_2: ; %Flow42 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB1_3: ; %memmove_copy_forward +; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo +; CHECK-NEXT: s_cbranch_execz .LBB1_6 +; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader +; CHECK-NEXT: v_mov_b32_e32 v11, v3 +; CHECK-NEXT: v_mov_b32_e32 v13, v1 +; CHECK-NEXT: v_mov_b32_e32 v10, v2 +; CHECK-NEXT: v_mov_b32_e32 v12, v0 +; CHECK-NEXT: s_mov_b32 s9, 0 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB1_5: ; %memmove_fwd_main_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: global_load_dwordx4 v[14:17], v[10:11], off +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5 +; CHECK-NEXT: v_add_co_u32 v10, s5, v10, 16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s5, 0, v11, s5 +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] +; CHECK-NEXT: s_or_b32 s9, s5, s9 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[14:17] +; CHECK-NEXT: v_add_co_u32 v12, s6, v12, 16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s6, 0, v13, s6 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; CHECK-NEXT: s_cbranch_execnz .LBB1_5 +; CHECK-NEXT: .LBB1_6: ; %Flow36 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; 
CHECK-NEXT: s_and_saveexec_b32 s8, s4 +; CHECK-NEXT: s_cbranch_execz .LBB1_9 +; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader +; CHECK-NEXT: v_sub_co_u32 v4, s5, v4, v8 +; CHECK-NEXT: v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5 +; CHECK-NEXT: s_mov_b32 s9, 0 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v5, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v5, s5 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB1_8: ; %memmove_fwd_residual_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: global_load_ubyte v4, v[2:3], off +; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5 +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: s_or_b32 s9, s5, s9 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 +; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; CHECK-NEXT: s_cbranch_execnz .LBB1_8 +; CHECK-NEXT: .LBB1_9: ; %Flow34 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 +; CHECK-NEXT: s_cbranch_execz .LBB1_2 +; CHECK-NEXT: .LBB1_10: ; %memmove_copy_backwards +; CHECK-NEXT: s_and_saveexec_b32 s7, s4 +; CHECK-NEXT: s_cbranch_execz .LBB1_13 +; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader +; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4 +; CHECK-NEXT: s_mov_b32 s8, 0 +; CHECK-NEXT: v_add_co_u32 v4, s4, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v3, v11, s4 +; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v1, v11, s4 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB1_12: ; %memmove_bwd_residual_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: global_load_ubyte v12, v[4:5], off +; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, -1, v5, s4 +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: s_or_b32 s8, s4, s8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[10:11], v12 +; CHECK-NEXT: v_add_co_u32 v10, s5, v10, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s5, -1, v11, s5 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: s_cbranch_execnz .LBB1_12 +; CHECK-NEXT: .LBB1_13: ; %Flow40 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo +; CHECK-NEXT: s_cbranch_execz .LBB1_16 +; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader +; CHECK-NEXT: v_lshlrev_b64 v[4:5], 4, v[6:7] +; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v4, -16 +; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB1_15: ; %memmove_bwd_main_loop +; CHECK-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off +; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 +; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; CHECK-NEXT: v_add_co_u32 v0, s4, v0, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s4, -1, v1, s4 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_cbranch_execnz .LBB1_15 +; CHECK-NEXT: .LBB1_16: ; %Flow38 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) { +; CHECK-LABEL: memmove_p0_p3: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v7, 7, v3 +; CHECK-NEXT: v_mov_b32_e32 v8, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; CHECK-NEXT: v_lshrrev_b64 v[5:6], 3, v[3:4] +; CHECK-NEXT: s_mov_b32 s6, exec_lo +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[7:8] +; CHECK-NEXT: v_cndmask_b32_e32 v9, -1, v0, vcc_lo +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[5:6] +; CHECK-NEXT: v_cmpx_ge_u32_e64 v2, v9 +; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 +; CHECK-NEXT: s_cbranch_execnz .LBB2_3 +; CHECK-NEXT: ; %bb.1: ; %Flow40 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 +; CHECK-NEXT: s_cbranch_execnz .LBB2_10 +; CHECK-NEXT: .LBB2_2: ; %Flow41 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB2_3: ; %memmove_copy_forward +; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo +; CHECK-NEXT: s_cbranch_execz .LBB2_6 +; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader +; CHECK-NEXT: v_mov_b32_e32 v10, v1 +; CHECK-NEXT: v_mov_b32_e32 v9, v0 +; CHECK-NEXT: v_mov_b32_e32 v11, v2 +; CHECK-NEXT: s_mov_b32 s9, 0 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB2_5: ; %memmove_fwd_main_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ds_read_b64 v[12:13], v11 +; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s5, -1, v6, s5 +; CHECK-NEXT: v_add_nc_u32_e32 v11, 8, v11 +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[5:6] +; CHECK-NEXT: s_or_b32 s9, s5, s9 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[9:10], v[12:13] +; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; CHECK-NEXT: s_cbranch_execnz .LBB2_5 +; CHECK-NEXT: .LBB2_6: ; %Flow35 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: s_and_saveexec_b32 s8, s4 +; CHECK-NEXT: s_cbranch_execz .LBB2_9 +; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader +; CHECK-NEXT: v_and_b32_e32 v5, -8, v3 +; CHECK-NEXT: v_sub_co_u32 v3, s5, v3, v7 +; CHECK-NEXT: v_subrev_co_ci_u32_e64 v4, s5, 0, v4, s5 +; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v4, s5 +; CHECK-NEXT: s_mov_b32 s9, 0 +; CHECK-NEXT: .p2align 6 +; 
CHECK-NEXT: .LBB2_8: ; %memmove_fwd_residual_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ds_read_u8 v3, v2 +; CHECK-NEXT: v_add_co_u32 v7, s5, v7, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s5, -1, v8, s5 +; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2 +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[7:8] +; CHECK-NEXT: s_or_b32 s9, s5, s9 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 +; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; CHECK-NEXT: s_cbranch_execnz .LBB2_8 +; CHECK-NEXT: .LBB2_9: ; %Flow33 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 +; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4 +; CHECK-NEXT: ; implicit-def: $vgpr2 +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 +; CHECK-NEXT: s_cbranch_execz .LBB2_2 +; CHECK-NEXT: .LBB2_10: ; %memmove_copy_backwards +; CHECK-NEXT: s_and_saveexec_b32 s7, s4 +; CHECK-NEXT: s_cbranch_execz .LBB2_13 +; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader +; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v1, s4 +; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1 +; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4 +; CHECK-NEXT: s_mov_b32 s8, 0 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB2_12: ; %memmove_bwd_residual_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ds_read_u8 v11, v4 +; CHECK-NEXT: v_add_co_u32 v7, s4, v7, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s4, -1, v8, s4 +; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4 +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8] +; CHECK-NEXT: s_or_b32 s8, s4, s8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[9:10], v11 +; CHECK-NEXT: v_add_co_u32 v9, s5, v9, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, -1, v10, s5 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: s_cbranch_execnz .LBB2_12 +; CHECK-NEXT: .LBB2_13: ; %Flow39 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo +; CHECK-NEXT: s_cbranch_execz .LBB2_16 +; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader +; CHECK-NEXT: v_lshlrev_b64 v[7:8], 3, v[5:6] +; CHECK-NEXT: v_and_b32_e32 v3, -8, v3 +; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: v_add3_u32 v2, v3, v2, -8 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v7, v0 +; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v8, v1, vcc_lo +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -8 +; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; CHECK-NEXT: .LBB2_15: ; %memmove_bwd_main_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ds_read_b64 v[3:4], v2 +; CHECK-NEXT: v_add_co_u32 v5, vcc_lo, v5, -1 +; CHECK-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v6, vcc_lo +; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6] +; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; CHECK-NEXT: v_add_co_u32 v0, s4, v0, -8 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s4, -1, v1, s4 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_cbranch_execnz .LBB2_15 +; CHECK-NEXT: .LBB2_16: ; %Flow37 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 
s[30:31] +entry: + tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) { +; CHECK-LABEL: memmove_p0_p4: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v9, 0 +; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 +; CHECK-NEXT: v_lshrrev_b64 v[6:7], 4, v[4:5] +; CHECK-NEXT: s_mov_b32 s6, exec_lo +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] +; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 +; CHECK-NEXT: s_cbranch_execnz .LBB3_3 +; CHECK-NEXT: ; %bb.1: ; %Flow37 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 +; CHECK-NEXT: s_cbranch_execnz .LBB3_10 +; CHECK-NEXT: .LBB3_2: ; %Flow38 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB3_3: ; %memmove_copy_forward +; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo +; CHECK-NEXT: s_cbranch_execz .LBB3_6 +; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader +; CHECK-NEXT: v_mov_b32_e32 v11, v3 +; CHECK-NEXT: v_mov_b32_e32 v13, v1 +; CHECK-NEXT: v_mov_b32_e32 v10, v2 +; CHECK-NEXT: v_mov_b32_e32 v12, v0 +; CHECK-NEXT: s_mov_b32 s9, 0 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB3_5: ; %memmove_fwd_main_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: global_load_dwordx4 v[14:17], v[10:11], off +; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5 +; CHECK-NEXT: v_add_co_u32 v10, s5, v10, 16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s5, 0, v11, s5 +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] +; CHECK-NEXT: s_or_b32 s9, s5, s9 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[14:17] +; CHECK-NEXT: v_add_co_u32 v12, s6, v12, 16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s6, 0, v13, s6 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; CHECK-NEXT: s_cbranch_execnz .LBB3_5 +; CHECK-NEXT: .LBB3_6: ; %Flow32 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: s_and_saveexec_b32 s8, s4 +; CHECK-NEXT: s_cbranch_execz .LBB3_9 +; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader +; CHECK-NEXT: v_sub_co_u32 v4, s5, v4, v8 +; CHECK-NEXT: v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5 +; CHECK-NEXT: s_mov_b32 s9, 0 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v5, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v5, s5 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB3_8: ; %memmove_fwd_residual_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: global_load_ubyte v4, v[2:3], off +; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5 +; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5 +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] +; CHECK-NEXT: s_or_b32 s9, s5, s9 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 +; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; CHECK-NEXT: s_cbranch_execnz .LBB3_8 +; CHECK-NEXT: .LBB3_9: ; %Flow30 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; 
CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 +; CHECK-NEXT: s_cbranch_execz .LBB3_2 +; CHECK-NEXT: .LBB3_10: ; %memmove_copy_backwards +; CHECK-NEXT: s_and_saveexec_b32 s7, s4 +; CHECK-NEXT: s_cbranch_execz .LBB3_13 +; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader +; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4 +; CHECK-NEXT: s_mov_b32 s8, 0 +; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4 +; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB3_12: ; %memmove_bwd_residual_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: global_load_ubyte v12, v[10:11], off +; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4 +; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4 +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: s_or_b32 s8, s4, s8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[4:5], v12 +; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: s_cbranch_execnz .LBB3_12 +; CHECK-NEXT: .LBB3_13: ; %Flow36 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo +; CHECK-NEXT: s_cbranch_execz .LBB3_16 +; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader +; CHECK-NEXT: v_lshlrev_b64 v[4:5], 4, v[6:7] +; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v4, -16 +; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB3_15: ; %memmove_bwd_main_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off +; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, -1 +; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 +; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; CHECK-NEXT: v_add_co_u32 v0, s4, v0, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s4, -1, v1, s4 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_cbranch_execnz .LBB3_15 +; CHECK-NEXT: .LBB3_16: ; %Flow34 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + +define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) { +; CHECK-LABEL: memmove_p0_p5: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v7, 15, v3 +; CHECK-NEXT: 
v_mov_b32_e32 v8, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; CHECK-NEXT: v_lshrrev_b64 v[5:6], 4, v[3:4] +; CHECK-NEXT: s_mov_b32 s6, exec_lo +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[7:8] +; CHECK-NEXT: v_cndmask_b32_e32 v9, -1, v0, vcc_lo +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[5:6] +; CHECK-NEXT: v_cmpx_ge_u32_e64 v2, v9 +; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 +; CHECK-NEXT: s_cbranch_execnz .LBB4_3 +; CHECK-NEXT: ; %bb.1: ; %Flow40 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 +; CHECK-NEXT: s_cbranch_execnz .LBB4_10 +; CHECK-NEXT: .LBB4_2: ; %Flow41 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB4_3: ; %memmove_copy_forward +; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo +; CHECK-NEXT: s_cbranch_execz .LBB4_6 +; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader +; CHECK-NEXT: v_mov_b32_e32 v10, v1 +; CHECK-NEXT: v_mov_b32_e32 v9, v0 +; CHECK-NEXT: v_mov_b32_e32 v11, v2 +; CHECK-NEXT: s_mov_b32 s9, 0 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB4_5: ; %memmove_fwd_main_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v12, v11, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v13, v11, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v14, v11, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v15, v11, s[0:3], 0 offen offset:12 +; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s5, -1, v6, s5 +; CHECK-NEXT: v_add_nc_u32_e32 v11, 16, v11 +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[5:6] +; CHECK-NEXT: s_or_b32 s9, s5, s9 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[9:10], v[12:15] +; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; CHECK-NEXT: s_cbranch_execnz .LBB4_5 +; CHECK-NEXT: .LBB4_6: ; %Flow35 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: s_and_saveexec_b32 s8, s4 +; CHECK-NEXT: s_cbranch_execz .LBB4_9 +; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader +; CHECK-NEXT: v_and_b32_e32 v5, -16, v3 +; CHECK-NEXT: v_sub_co_u32 v3, s5, v3, v7 +; CHECK-NEXT: v_subrev_co_ci_u32_e64 v4, s5, 0, v4, s5 +; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v4, s5 +; CHECK-NEXT: s_mov_b32 s9, 0 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB4_8: ; %memmove_fwd_residual_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen +; CHECK-NEXT: v_add_co_u32 v7, s5, v7, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s5, -1, v8, s5 +; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2 +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[7:8] +; CHECK-NEXT: s_or_b32 s9, s5, s9 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 +; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 +; CHECK-NEXT: s_cbranch_execnz .LBB4_8 +; CHECK-NEXT: .LBB4_9: ; %Flow33 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 +; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4 +; CHECK-NEXT: ; implicit-def: $vgpr2 +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 +; CHECK-NEXT: s_cbranch_execz .LBB4_2 +; CHECK-NEXT: .LBB4_10: ; 
%memmove_copy_backwards +; CHECK-NEXT: s_and_saveexec_b32 s7, s4 +; CHECK-NEXT: s_cbranch_execz .LBB4_13 +; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader +; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v1, s4 +; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1 +; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4 +; CHECK-NEXT: s_mov_b32 s8, 0 +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB4_12: ; %memmove_bwd_residual_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen +; CHECK-NEXT: v_add_co_u32 v7, s4, v7, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s4, -1, v8, s4 +; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4 +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8] +; CHECK-NEXT: s_or_b32 s8, s4, s8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[9:10], v11 +; CHECK-NEXT: v_add_co_u32 v9, s5, v9, -1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, -1, v10, s5 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: s_cbranch_execnz .LBB4_12 +; CHECK-NEXT: .LBB4_13: ; %Flow39 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo +; CHECK-NEXT: s_cbranch_execz .LBB4_16 +; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader +; CHECK-NEXT: v_lshlrev_b64 v[7:8], 4, v[5:6] +; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 +; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: v_add3_u32 v2, v3, v2, -16 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v7, v0 +; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v8, v1, vcc_lo +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 +; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; CHECK-NEXT: .p2align 6 +; CHECK-NEXT: .LBB4_15: ; %memmove_bwd_main_loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: v_add_co_u32 v5, vcc_lo, v5, -1 +; CHECK-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v6, vcc_lo +; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2 +; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6] +; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; CHECK-NEXT: v_add_co_u32 v0, s4, v0, -16 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s4, -1, v1, s4 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 +; CHECK-NEXT: s_cbranch_execnz .LBB4_15 +; CHECK-NEXT: .LBB4_16: ; %Flow37 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false) + ret void +} + + +define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) { +; CHECK-LABEL: memmove_p1_p0: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v9, 0 +; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 +; CHECK-NEXT: v_lshrrev_b64 v[6:7], 4, v[4:5] +; CHECK-NEXT: s_mov_b32 s6, exec_lo +; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] +; 
CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB5_3
+; CHECK-NEXT: ; %bb.1: ; %Flow41
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB5_10
+; CHECK-NEXT: .LBB5_2: ; %Flow42
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB5_3: ; %memmove_copy_forward
+; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB5_6
+; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
+; CHECK-NEXT: v_mov_b32_e32 v11, v3
+; CHECK-NEXT: v_mov_b32_e32 v13, v1
+; CHECK-NEXT: v_mov_b32_e32 v10, v2
+; CHECK-NEXT: v_mov_b32_e32 v12, v0
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB5_5: ; %memmove_fwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_dwordx4 v[14:17], v[10:11]
+; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
+; CHECK-NEXT: v_add_co_u32 v10, s5, v10, 16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s5, 0, v11, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s9, s5, s9
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[12:13], v[14:17], off
+; CHECK-NEXT: v_add_co_u32 v12, s6, v12, 16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s6, 0, v13, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB5_5
+; CHECK-NEXT: .LBB5_6: ; %Flow36
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_and_saveexec_b32 s8, s4
+; CHECK-NEXT: s_cbranch_execz .LBB5_9
+; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
+; CHECK-NEXT: v_sub_co_u32 v4, s5, v4, v8
+; CHECK-NEXT: v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v5, s5
+; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v5, s5
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB5_8: ; %memmove_fwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3]
+; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
+; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
+; CHECK-NEXT: s_or_b32 s9, s5, s9
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_byte v[0:1], v4, off
+; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB5_8
+; CHECK-NEXT: .LBB5_9: ; %Flow34
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execz .LBB5_2
+; CHECK-NEXT: .LBB5_10: ; %memmove_copy_backwards
+; CHECK-NEXT: s_and_saveexec_b32 s7, s4
+; CHECK-NEXT: s_cbranch_execz .LBB5_13
+; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
+; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
+; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB5_12: ; %memmove_bwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_ubyte v12, v[10:11]
+; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
+; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
+; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
+; CHECK-NEXT: s_or_b32 s8, s4, s8
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_byte v[4:5], v12, off
+; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB5_12
+; CHECK-NEXT: .LBB5_13: ; %Flow40
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB5_16
+; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
+; CHECK-NEXT: v_lshlrev_b64 v[4:5], 4, v[6:7]
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v4, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB5_15: ; %memmove_bwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: v_add_co_u32 v0, s4, v0, -16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s4, -1, v1, s4
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB5_15
+; CHECK-NEXT: .LBB5_16: ; %Flow38
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p1_p1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v9, 0
+; CHECK-NEXT: v_and_b32_e32 v8, 15, v4
+; CHECK-NEXT: v_lshrrev_b64 v[6:7], 4, v[4:5]
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9]
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1]
+; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB6_3
+; CHECK-NEXT: ; %bb.1: ; %Flow47
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB6_10
+; CHECK-NEXT: .LBB6_2: ; %Flow48
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB6_3: ; %memmove_copy_forward
+; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB6_6
+; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
+; CHECK-NEXT: v_mov_b32_e32 v11, v3
+; CHECK-NEXT: v_mov_b32_e32 v13, v1
+; CHECK-NEXT: v_mov_b32_e32 v10, v2
+; CHECK-NEXT: v_mov_b32_e32 v12, v0
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB6_5: ; %memmove_fwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_dwordx4 v[14:17], v[10:11], off
+; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
+; CHECK-NEXT: v_add_co_u32 v10, s5, v10, 16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s5, 0, v11, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s9, s5, s9
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[12:13], v[14:17], off
+; CHECK-NEXT: v_add_co_u32 v12, s6, v12, 16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s6, 0, v13, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB6_5
+; CHECK-NEXT: .LBB6_6: ; %Flow42
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_and_saveexec_b32 s8, s4
+; CHECK-NEXT: s_cbranch_execz .LBB6_9
+; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
+; CHECK-NEXT: v_sub_co_u32 v4, s5, v4, v8
+; CHECK-NEXT: v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v5, s5
+; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v5, s5
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB6_8: ; %memmove_fwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off
+; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
+; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
+; CHECK-NEXT: s_or_b32 s9, s5, s9
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_byte v[0:1], v4, off
+; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB6_8
+; CHECK-NEXT: .LBB6_9: ; %Flow40
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execz .LBB6_2
+; CHECK-NEXT: .LBB6_10: ; %memmove_copy_backwards
+; CHECK-NEXT: s_and_saveexec_b32 s7, s4
+; CHECK-NEXT: s_cbranch_execz .LBB6_13
+; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
+; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
+; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB6_12: ; %memmove_bwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_ubyte v12, v[10:11], off
+; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
+; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
+; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
+; CHECK-NEXT: s_or_b32 s8, s4, s8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_byte v[4:5], v12, off
+; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB6_12
+; CHECK-NEXT: .LBB6_13: ; %Flow46
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB6_16
+; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
+; CHECK-NEXT: v_lshlrev_b64 v[4:5], 4, v[6:7]
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v4, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB6_15: ; %memmove_bwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: v_add_co_u32 v0, s4, v0, -16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s4, -1, v1, s4
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB6_15
+; CHECK-NEXT: .LBB6_16: ; %Flow44
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p1_p3:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 3, v[3:4]
+; CHECK-NEXT: s_mov_b64 s[6:7], 0
+; CHECK-NEXT: s_mov_b32 s5, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB7_3
+; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: v_mov_b32_e32 v7, v1
+; CHECK-NEXT: v_mov_b32_e32 v6, v0
+; CHECK-NEXT: v_mov_b32_e32 v8, v2
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: .LBB7_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ds_read_b64 v[9:10], v8
+; CHECK-NEXT: s_add_u32 s6, s6, 1
+; CHECK-NEXT: s_addc_u32 s7, s7, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[6:7], v[4:5]
+; CHECK-NEXT: s_or_b32 s8, vcc_lo, s8
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v[6:7], v[9:10], off
+; CHECK-NEXT: v_add_co_u32 v6, s4, v6, 8
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s4, 0, v7, s4
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB7_2
+; CHECK-NEXT: .LBB7_3: ; %Flow11
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: v_and_b32_e32 v6, 7, v3
+; CHECK-NEXT: v_mov_b32_e32 v7, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[6:7]
+; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execz .LBB7_7
+; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: v_and_b32_e32 v8, -8, v3
+; CHECK-NEXT: v_lshlrev_b64 v[3:4], 3, v[4:5]
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v8
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
+; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; CHECK-NEXT: .LBB7_5: ; %loop-memcpy-residual
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ds_read_u8 v5, v2
+; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4
+; CHECK-NEXT: s_add_u32 s4, s4, 1
+; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[6:7]
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_byte v[3:4], v5, off
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB7_5
+; CHECK-NEXT: ; %bb.6: ; %Flow
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: .LBB7_7: ; %Flow9
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p1_p4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v9, 0
+; CHECK-NEXT: v_and_b32_e32 v8, 15, v4
+; CHECK-NEXT: v_lshrrev_b64 v[6:7], 4, v[4:5]
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9]
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1]
+; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB8_3
+; CHECK-NEXT: ; %bb.1: ; %Flow41
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB8_10
+; CHECK-NEXT: .LBB8_2: ; %Flow42
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB8_3: ; %memmove_copy_forward
+; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB8_6
+; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
+; CHECK-NEXT: v_mov_b32_e32 v11, v3
+; CHECK-NEXT: v_mov_b32_e32 v13, v1
+; CHECK-NEXT: v_mov_b32_e32 v10, v2
+; CHECK-NEXT: v_mov_b32_e32 v12, v0
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB8_5: ; %memmove_fwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_dwordx4 v[14:17], v[10:11], off
+; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
+; CHECK-NEXT: v_add_co_u32 v10, s5, v10, 16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s5, 0, v11, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s9, s5, s9
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[12:13], v[14:17], off
+; CHECK-NEXT: v_add_co_u32 v12, s6, v12, 16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s6, 0, v13, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB8_5
+; CHECK-NEXT: .LBB8_6: ; %Flow36
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_and_saveexec_b32 s8, s4
+; CHECK-NEXT: s_cbranch_execz .LBB8_9
+; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
+; CHECK-NEXT: v_sub_co_u32 v4, s5, v4, v8
+; CHECK-NEXT: v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v5, s5
+; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v5, s5
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB8_8: ; %memmove_fwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off
+; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
+; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
+; CHECK-NEXT: s_or_b32 s9, s5, s9
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_byte v[0:1], v4, off
+; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB8_8
+; CHECK-NEXT: .LBB8_9: ; %Flow34
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execz .LBB8_2
+; CHECK-NEXT: .LBB8_10: ; %memmove_copy_backwards
+; CHECK-NEXT: s_and_saveexec_b32 s7, s4
+; CHECK-NEXT: s_cbranch_execz .LBB8_13
+; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
+; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
+; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB8_12: ; %memmove_bwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_ubyte v12, v[10:11], off
+; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
+; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
+; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
+; CHECK-NEXT: s_or_b32 s8, s4, s8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_byte v[4:5], v12, off
+; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB8_12
+; CHECK-NEXT: .LBB8_13: ; %Flow40
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB8_16
+; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
+; CHECK-NEXT: v_lshlrev_b64 v[4:5], 4, v[6:7]
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v4, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB8_15: ; %memmove_bwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: v_add_co_u32 v0, s4, v0, -16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s4, -1, v1, s4
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB8_15
+; CHECK-NEXT: .LBB8_16: ; %Flow38
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p1_p5:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 4, v[3:4]
+; CHECK-NEXT: s_mov_b64 s[6:7], 0
+; CHECK-NEXT: s_mov_b32 s5, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB9_3
+; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: v_mov_b32_e32 v7, v1
+; CHECK-NEXT: v_mov_b32_e32 v6, v0
+; CHECK-NEXT: v_mov_b32_e32 v8, v2
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB9_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v9, v8, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v10, v8, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v11, v8, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v12, v8, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_add_u32 s6, s6, 1
+; CHECK-NEXT: s_addc_u32 s7, s7, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[6:7], v[4:5]
+; CHECK-NEXT: s_or_b32 s8, vcc_lo, s8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[9:12], off
+; CHECK-NEXT: v_add_co_u32 v6, s4, v6, 16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s4, 0, v7, s4
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB9_2
+; CHECK-NEXT: .LBB9_3: ; %Flow11
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: v_and_b32_e32 v6, 15, v3
+; CHECK-NEXT: v_mov_b32_e32 v7, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[6:7]
+; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execz .LBB9_7
+; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: v_and_b32_e32 v8, -16, v3
+; CHECK-NEXT: v_lshlrev_b64 v[3:4], 4, v[4:5]
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v8
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
+; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; CHECK-NEXT: .LBB9_5: ; %loop-memcpy-residual
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen
+; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4
+; CHECK-NEXT: s_add_u32 s4, s4, 1
+; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[6:7]
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_byte v[3:4], v5, off
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB9_5
+; CHECK-NEXT: ; %bb.6: ; %Flow
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: .LBB9_7: ; %Flow9
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+
+define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p3_p0:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v7, 7, v3
+; CHECK-NEXT: v_mov_b32_e32 v8, 0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
+; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base
+; CHECK-NEXT: v_lshrrev_b64 v[5:6], 3, v[3:4]
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[7:8]
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, vcc_lo
+; CHECK-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc_lo
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[5:6]
+; CHECK-NEXT: v_cmpx_ge_u64_e64 v[1:2], v[9:10]
+; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB10_3
+; CHECK-NEXT: ; %bb.1: ; %Flow40
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB10_10
+; CHECK-NEXT: .LBB10_2: ; %Flow41
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB10_3: ; %memmove_copy_forward
+; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB10_6
+; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
+; CHECK-NEXT: v_mov_b32_e32 v10, v2
+; CHECK-NEXT: v_mov_b32_e32 v9, v1
+; CHECK-NEXT: v_mov_b32_e32 v11, v0
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB10_5: ; %memmove_fwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[9:10]
+; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s5, -1, v6, s5
+; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 8
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, 0, v10, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[5:6]
+; CHECK-NEXT: s_or_b32 s9, s6, s9
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_write_b64 v11, v[12:13]
+; CHECK-NEXT: v_add_nc_u32_e32 v11, 8, v11
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB10_5
+; CHECK-NEXT: .LBB10_6: ; %Flow35
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_and_saveexec_b32 s8, s4
+; CHECK-NEXT: s_cbranch_execz .LBB10_9
+; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
+; CHECK-NEXT: v_and_b32_e32 v5, -8, v3
+; CHECK-NEXT: v_sub_co_u32 v6, s5, v3, v7
+; CHECK-NEXT: v_subrev_co_ci_u32_e64 v4, s5, 0, v4, s5
+; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v5
+; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v2, v4, s5
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB10_8: ; %memmove_fwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_ubyte v2, v[0:1]
+; CHECK-NEXT: v_add_co_u32 v7, s5, v7, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s5, -1, v8, s5
+; CHECK-NEXT: v_add_co_u32 v0, s5, v0, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, 0, v1, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[7:8]
+; CHECK-NEXT: s_or_b32 s9, s6, s9
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_write_b8 v3, v2
+; CHECK-NEXT: v_add_nc_u32_e32 v3, 1, v3
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB10_8
+; CHECK-NEXT: .LBB10_9: ; %Flow33
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
+; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr1_vgpr2
+; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execz .LBB10_2
+; CHECK-NEXT: .LBB10_10: ; %memmove_copy_backwards
+; CHECK-NEXT: s_and_saveexec_b32 s7, s4
+; CHECK-NEXT: s_cbranch_execz .LBB10_13
+; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
+; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v2, s4
+; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
+; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB10_12: ; %memmove_bwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_ubyte v11, v[9:10]
+; CHECK-NEXT: v_add_co_u32 v7, s4, v7, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s4, -1, v8, s4
+; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[7:8]
+; CHECK-NEXT: s_or_b32 s8, s5, s8
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_write_b8 v4, v11
+; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB10_12
+; CHECK-NEXT: .LBB10_13: ; %Flow39
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB10_16
+; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
+; CHECK-NEXT: v_lshlrev_b64 v[7:8], 3, v[5:6]
+; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v7, v1
+; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v8, v2, vcc_lo
+; CHECK-NEXT: v_add3_u32 v2, v3, v0, -8
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v1, -8
+; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
+; CHECK-NEXT: .LBB10_15: ; %memmove_bwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_dwordx2 v[3:4], v[0:1]
+; CHECK-NEXT: v_add_co_u32 v5, vcc_lo, v5, -1
+; CHECK-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v6, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -8
+; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[5:6]
+; CHECK-NEXT: s_or_b32 s7, s4, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_write_b64 v2, v[3:4]
+; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB10_15
+; CHECK-NEXT: .LBB10_16: ; %Flow37
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p3_p1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 3, v[3:4]
+; CHECK-NEXT: s_mov_b64 s[6:7], 0
+; CHECK-NEXT: s_mov_b32 s5, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB11_3
+; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: v_mov_b32_e32 v7, v2
+; CHECK-NEXT: v_mov_b32_e32 v6, v1
+; CHECK-NEXT: v_mov_b32_e32 v8, v0
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: .LBB11_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_dwordx2 v[9:10], v[6:7], off
+; CHECK-NEXT: s_add_u32 s6, s6, 1
+; CHECK-NEXT: s_addc_u32 s7, s7, 0
+; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, 8
+; CHECK-NEXT: v_cmp_ge_u64_e64 s4, s[6:7], v[4:5]
+; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; CHECK-NEXT: s_or_b32 s8, s4, s8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v8, v[9:10]
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB11_2
+; CHECK-NEXT: .LBB11_3: ; %Flow11
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: v_and_b32_e32 v6, 7, v3
+; CHECK-NEXT: v_mov_b32_e32 v7, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[6:7]
+; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execz .LBB11_7
+; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: v_and_b32_e32 v8, -8, v3
+; CHECK-NEXT: v_lshlrev_b64 v[3:4], 3, v[4:5]
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v8
+; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
+; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
+; CHECK-NEXT: .LBB11_5: ; %loop-memcpy-residual
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT: s_add_u32 s4, s4, 1
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[6:7]
+; CHECK-NEXT: global_load_ubyte v3, v[3:4], off
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v3
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB11_5
+; CHECK-NEXT: ; %bb.6: ; %Flow
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: .LBB11_7: ; %Flow9
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p3_p3:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v6, 7, v2
+; CHECK-NEXT: v_mov_b32_e32 v7, 0
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 3, v[2:3]
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7]
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; CHECK-NEXT: v_cmpx_ge_u32_e64 v1, v0
+; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB12_3
+; CHECK-NEXT: ; %bb.1: ; %Flow44
+; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB12_10
+; CHECK-NEXT: .LBB12_2: ; %Flow45
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB12_3: ; %memmove_copy_forward
+; CHECK-NEXT: s_and_saveexec_b32 s7, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB12_6
+; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
+; CHECK-NEXT: v_mov_b32_e32 v3, v1
+; CHECK-NEXT: v_mov_b32_e32 v8, v0
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: .LBB12_5: ; %memmove_fwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ds_read_b64 v[9:10], v3
+; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT: v_add_nc_u32_e32 v3, 8, v3
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[4:5]
+; CHECK-NEXT: s_or_b32 s8, s5, s8
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_write_b64 v8, v[9:10]
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB12_5
+; CHECK-NEXT: .LBB12_6: ; %Flow39
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_and_saveexec_b32 s7, s4
+; CHECK-NEXT: s_cbranch_execz .LBB12_9
+; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
+; CHECK-NEXT: v_and_b32_e32 v2, -8, v2
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; CHECK-NEXT: .LBB12_8: ; %memmove_fwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ds_read_u8 v2, v1
+; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
+; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s8, s5, s8
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v2
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB12_8
+; CHECK-NEXT: .LBB12_9: ; %Flow37
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr1
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6
+; CHECK-NEXT: s_cbranch_execz .LBB12_2
+; CHECK-NEXT: .LBB12_10: ; %memmove_copy_backwards
+; CHECK-NEXT: s_and_saveexec_b32 s6, s4
+; CHECK-NEXT: s_cbranch_execz .LBB12_13
+; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
+; CHECK-NEXT: v_add_nc_u32_e32 v8, -1, v2
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v8
+; CHECK-NEXT: v_add_nc_u32_e32 v8, v1, v8
+; CHECK-NEXT: .LBB12_12: ; %memmove_bwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ds_read_u8 v9, v8
+; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s4, -1, v7, s4
+; CHECK-NEXT: v_add_nc_u32_e32 v8, -1, v8
+; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s7, s4, s7
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_write_b8 v3, v9
+; CHECK-NEXT: v_add_nc_u32_e32 v3, -1, v3
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB12_12
+; CHECK-NEXT: .LBB12_13: ; %Flow43
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB12_16
+; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
+; CHECK-NEXT: v_and_b32_e32 v2, -8, v2
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; CHECK-NEXT: .LBB12_15: ; %memmove_bwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ds_read_b64 v[2:3], v1
+; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
+; CHECK-NEXT: v_add_nc_u32_e32 v1, -8, v1
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
+; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[2:3]
+; CHECK-NEXT: v_add_nc_u32_e32 v0, -8, v0
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB12_15
+; CHECK-NEXT: .LBB12_16: ; %Flow41
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p3_p4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 3, v[3:4]
+; CHECK-NEXT: s_mov_b64 s[6:7], 0
+; CHECK-NEXT: s_mov_b32 s5, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB13_3
+; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: v_mov_b32_e32 v7, v2
+; CHECK-NEXT: v_mov_b32_e32 v6, v1
+; CHECK-NEXT: v_mov_b32_e32 v8, v0
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: .LBB13_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_dwordx2 v[9:10], v[6:7], off
+; CHECK-NEXT: s_add_u32 s6, s6, 1
+; CHECK-NEXT: s_addc_u32 s7, s7, 0
+; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, 8
+; CHECK-NEXT: v_cmp_ge_u64_e64 s4, s[6:7], v[4:5]
+; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; CHECK-NEXT: s_or_b32 s8, s4, s8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v8, v[9:10]
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB13_2
+; CHECK-NEXT: .LBB13_3: ; %Flow11
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: v_and_b32_e32 v6, 7, v3
+; CHECK-NEXT: v_mov_b32_e32 v7, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[6:7]
+; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execz .LBB13_7
+; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: v_and_b32_e32 v8, -8, v3
+; CHECK-NEXT: v_lshlrev_b64 v[3:4], 3, v[4:5]
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v8
+; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
+; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
+; CHECK-NEXT: .LBB13_5: ; %loop-memcpy-residual
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT: s_add_u32 s4, s4, 1
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[6:7]
+; CHECK-NEXT: global_load_ubyte v3, v[3:4], off
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v3
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB13_5
+; CHECK-NEXT: ; %bb.6: ; %Flow
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: .LBB13_7: ; %Flow9
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p3_p5:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshrrev_b64 v[3:4], 3, v[2:3]
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[3:4]
+; CHECK-NEXT: s_cbranch_execz .LBB14_3
+; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: v_mov_b32_e32 v5, v1
+; CHECK-NEXT: v_mov_b32_e32 v6, v0
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: .LBB14_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: buffer_load_dword v7, v5, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v5, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_add_u32 s4, s4, 1
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v5, 8, v5
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[3:4]
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v6, v[7:8]
+; CHECK-NEXT: v_add_nc_u32_e32 v6, 8, v6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB14_2
+; CHECK-NEXT: .LBB14_3: ; %Flow14
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: v_and_b32_e32 v3, 7, v2
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[3:4]
+; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execz .LBB14_7
+; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: v_and_b32_e32 v2, -8, v2
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; CHECK-NEXT: .LBB14_5: ; %loop-memcpy-residual
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: s_add_u32 s4, s4, 1
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[3:4]
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v2
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB14_5
+; CHECK-NEXT: ; %bb.6: ; %Flow
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: .LBB14_7: ; %Flow12
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+
+define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p5_p0:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v7, 15, v3
+; CHECK-NEXT: v_mov_b32_e32 v8, 0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_lshrrev_b64 v[5:6], 4, v[3:4]
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[7:8]
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, vcc_lo
+; CHECK-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc_lo
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[5:6]
+; CHECK-NEXT: v_cmpx_ge_u64_e64 v[1:2], v[9:10]
+; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB15_3
+; CHECK-NEXT: ; %bb.1: ; %Flow40
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB15_10
+; CHECK-NEXT: .LBB15_2: ; %Flow41
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB15_3: ; %memmove_copy_forward
+; CHECK-NEXT: s_and_saveexec_b32 s6, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB15_6
+; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
+; CHECK-NEXT: v_mov_b32_e32 v10, v2
+; CHECK-NEXT: v_mov_b32_e32 v9, v1
+; CHECK-NEXT: v_mov_b32_e32 v11, v0
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB15_5: ; %memmove_fwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[9:10]
+; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s5, -1, v6, s5
+; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, 0, v10, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[5:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v15, v11, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v14, v11, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v13, v11, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v11, 16, v11
+; CHECK-NEXT: s_or_b32 s8, s5, s8
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB15_5
+; CHECK-NEXT: .LBB15_6: ; %Flow35
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_and_saveexec_b32 s8, s4
+; CHECK-NEXT: s_cbranch_execz .LBB15_9
+; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
+; CHECK-NEXT: v_and_b32_e32 v5, -16, v3
+; CHECK-NEXT: v_sub_co_u32 v6, s5, v3, v7
+; CHECK-NEXT: v_subrev_co_ci_u32_e64 v4, s5, 0, v4, s5
+; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v5
+; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v2, v4, s5
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB15_8: ; %memmove_fwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_ubyte v2, v[0:1]
+; CHECK-NEXT: v_add_co_u32 v7, s5, v7, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s5, -1, v8, s5
+; CHECK-NEXT: v_add_co_u32 v0, s5, v0, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, 0, v1, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[7:8]
+; CHECK-NEXT: s_or_b32 s9, s6, s9
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v3, 1, v3
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB15_8
+; CHECK-NEXT: .LBB15_9: ; %Flow33
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
+; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr1_vgpr2
+; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execz .LBB15_2
+; CHECK-NEXT: .LBB15_10: ; %memmove_copy_backwards
+; CHECK-NEXT: s_and_saveexec_b32 s7, s4
+; CHECK-NEXT: s_cbranch_execz .LBB15_13
+; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
+; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v2, s4
+; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
+; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB15_12: ; %memmove_bwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_ubyte v11, v[9:10]
+; CHECK-NEXT: v_add_co_u32 v7, s4, v7, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s4, -1, v8, s4
+; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[7:8]
+; CHECK-NEXT: s_or_b32 s8, s5, s8
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v11, v4, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB15_12
+; CHECK-NEXT: .LBB15_13: ; %Flow39
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB15_16
+; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
+; CHECK-NEXT: v_lshlrev_b64 v[7:8], 4, v[5:6]
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
+; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v7, v1
+; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v8, v2, vcc_lo
+; CHECK-NEXT: v_add3_u32 v2, v3, v0, -16
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v1, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB15_15: ; %memmove_bwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
+; CHECK-NEXT: v_add_co_u32 v5, vcc_lo, v5, -1
+; CHECK-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v6, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
+; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_cbranch_execnz .LBB15_15
+; CHECK-NEXT: .LBB15_16: ; %Flow37
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p5_p1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 4, v[3:4]
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB16_3
+; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: v_mov_b32_e32 v7, v2
+; CHECK-NEXT: v_mov_b32_e32 v6, v1
+; CHECK-NEXT: v_mov_b32_e32 v8, v0
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB16_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_dwordx4 v[9:12], v[6:7], off
+; CHECK-NEXT: s_add_u32 s4, s4, 1
+; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, 16
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[4:5]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB16_2
+; CHECK-NEXT: .LBB16_3: ; %Flow11
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: v_and_b32_e32 v6, 15, v3
+; CHECK-NEXT: v_mov_b32_e32 v7, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[6:7]
+; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execz .LBB16_7
+; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: v_and_b32_e32 v8, -16, v3
+; CHECK-NEXT: v_lshlrev_b64 v[3:4], 4, v[4:5]
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v8
+; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
+; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
+; CHECK-NEXT: .LBB16_5: ; %loop-memcpy-residual
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT: s_add_u32 s4, s4, 1
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[6:7]
+; CHECK-NEXT: global_load_ubyte v3, v[3:4], off
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB16_5
+; CHECK-NEXT: ; %bb.6: ; %Flow
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: .LBB16_7: ; %Flow9
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p5_p3:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshrrev_b64 v[3:4], 3, v[2:3]
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[3:4]
+; CHECK-NEXT: s_cbranch_execz .LBB17_3
+; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: v_mov_b32_e32 v5, v1
+; CHECK-NEXT: v_mov_b32_e32 v6, v0
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: .LBB17_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ds_read_b64 v[7:8], v5
+; CHECK-NEXT: s_add_u32 s4, s4, 1
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v5, 8, v5
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[3:4]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v6, 8, v6
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB17_2
+; CHECK-NEXT: .LBB17_3: ; %Flow14
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: v_and_b32_e32 v3, 7, v2
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[3:4]
+; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execz .LBB17_7
+; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: v_and_b32_e32 v2, -8, v2
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; CHECK-NEXT: .LBB17_5: ; %loop-memcpy-residual
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ds_read_u8 v2, v1
+; CHECK-NEXT: s_add_u32 s4, s4, 1
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[3:4]
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB17_5
+; CHECK-NEXT: ; %bb.6: ; %Flow
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: .LBB17_7: ; %Flow12
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p5_p4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 4, v[3:4]
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB18_3
+; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT: v_mov_b32_e32 v7, v2
+; CHECK-NEXT: v_mov_b32_e32 v6, v1
+; CHECK-NEXT: v_mov_b32_e32 v8, v0
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB18_2: ; %loop-memcpy-expansion
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_dwordx4 v[9:12], v[6:7], off
+; CHECK-NEXT: s_add_u32 s4, s4, 1
+; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, 16
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[4:5]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB18_2
+; CHECK-NEXT: .LBB18_3: ; %Flow11
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: v_and_b32_e32 v6, 15, v3
+; CHECK-NEXT: v_mov_b32_e32 v7, 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[6:7]
+; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execz .LBB18_7
+; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT: v_and_b32_e32 v8, -16, v3
+; CHECK-NEXT: v_lshlrev_b64 v[3:4], 4, v[4:5]
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v8
+; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
+; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
+; CHECK-NEXT: .LBB18_5: ; %loop-memcpy-residual
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT: s_add_u32 s4, s4, 1
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[6:7]
+; CHECK-NEXT: global_load_ubyte v3, v[3:4], off
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB18_5
+; CHECK-NEXT: ; %bb.6: ; %Flow
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: .LBB18_7: ; %Flow9
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p5_p5:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v6, 15, v2
+; CHECK-NEXT: v_mov_b32_e32 v7, 0
+; CHECK-NEXT: v_lshrrev_b64 v[4:5], 4, v[2:3]
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[6:7]
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; CHECK-NEXT: v_cmpx_ge_u32_e64 v1, v0
+; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB19_3
+; CHECK-NEXT: ; %bb.1: ; %Flow44
+; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB19_10
+; CHECK-NEXT: .LBB19_2: ; %Flow45
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB19_3: ; %memmove_copy_forward
+; CHECK-NEXT: s_and_saveexec_b32 s7, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB19_6
+; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
+; CHECK-NEXT: v_mov_b32_e32 v3, v1
+; CHECK-NEXT: v_mov_b32_e32 v8, v0
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB19_5: ; %memmove_fwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v9, v3, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v10, v3, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v11, v3, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v12, v3, s[0:3], 0 offen
+; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[4:5]
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
+; CHECK-NEXT: s_or_b32 s8, s5, s8
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB19_5
+; CHECK-NEXT: .LBB19_6: ; %Flow39
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_and_saveexec_b32 s7, s4
+; CHECK-NEXT: s_cbranch_execz .LBB19_9
+; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v2
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; CHECK-NEXT: .LBB19_8: ; %memmove_fwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
+; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s8, s5, s8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB19_8
+; CHECK-NEXT: .LBB19_9: ; %Flow37
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr1
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6
+; CHECK-NEXT: s_cbranch_execz .LBB19_2
+; CHECK-NEXT: .LBB19_10: ; %memmove_copy_backwards
+; CHECK-NEXT: s_and_saveexec_b32 s6, s4
+; CHECK-NEXT: s_cbranch_execz .LBB19_13
+; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
+; CHECK-NEXT: v_add_nc_u32_e32 v8, -1, v2
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v8
+; CHECK-NEXT: v_add_nc_u32_e32 v8, v1, v8
+; CHECK-NEXT: .LBB19_12: ; %memmove_bwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: buffer_load_ubyte v9, v8, s[0:3], 0 offen
+; CHECK-NEXT: v_add_co_u32 v6, s4, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s4, -1, v7, s4
+; CHECK-NEXT: v_add_nc_u32_e32 v8, -1, v8
+; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s7, s4, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v9, v3, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v3, -1, v3
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB19_12
+; CHECK-NEXT: .LBB19_13: ; %Flow43
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB19_16
+; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v2
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB19_15: ; %memmove_bwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen
+; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
+; CHECK-NEXT: v_add_nc_u32_e32 v1, -16, v1
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
+; CHECK-NEXT: v_add_nc_u32_e32 v0, -16, v0
+; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB19_15
+; CHECK-NEXT: .LBB19_16: ; %Flow41
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
+  ret void
+}
+
+
+declare void @llvm.memmove.p0.p0.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p0.p1.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p0.p3.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p0.p4.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p0.p5.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p1.p0.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p1.p3.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p1.p4.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p3.p0.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p3.p1.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p3.p3.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p3.p4.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p3.p5.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p5.p0.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p5.p1.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p5.p3.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p5.p4.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }