diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 5bfd8914b9a46..09f7877b13b3a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
   return 1024;
 }
 
-// FIXME: Should we use narrower types for local/region, or account for when
-// unaligned access is legal?
 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
     LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
     unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -426,29 +424,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
   if (AtomicElementSize)
     return Type::getIntNTy(Context, *AtomicElementSize * 8);
 
-  Align MinAlign = std::min(SrcAlign, DestAlign);
-
-  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
-  // hardware into byte accesses. If you assume all alignments are equally
-  // probable, it's more efficient on average to use short accesses for this
-  // case.
-  if (MinAlign == Align(2))
-    return Type::getInt16Ty(Context);
-
-  // Not all subtargets have 128-bit DS instructions, and we currently don't
-  // form them by default.
-  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
-      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
-    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
-  }
-
-  // Global memory works best with 16-byte accesses.
+  // 16-byte accesses achieve the highest copy throughput.
   // If the operation has a fixed known length that is large enough, it is
   // worthwhile to return an even wider type and let legalization lower it into
-  // multiple accesses, effectively unrolling the memcpy loop. Private memory
-  // also hits this, although accesses may be decomposed.
+  // multiple accesses, effectively unrolling the memcpy loop.
+  // We also rely on legalization to decompose into smaller accesses for
+  // subtargets and address spaces where it is necessary.
   //
   // Don't unroll if Length is not a constant, since unrolling leads to worse
   // performance for length values that are smaller or slightly larger than the
@@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
         OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
         DestAlign, AtomicCpySize);
 
-  Align MinAlign = std::min(SrcAlign, DestAlign);
-
-  if (MinAlign != Align(2)) {
-    Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
-    while (RemainingBytes >= 16) {
-      OpsOut.push_back(I32x4Ty);
-      RemainingBytes -= 16;
-    }
+  Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+  while (RemainingBytes >= 16) {
+    OpsOut.push_back(I32x4Ty);
+    RemainingBytes -= 16;
+  }
 
-    Type *I64Ty = Type::getInt64Ty(Context);
-    while (RemainingBytes >= 8) {
-      OpsOut.push_back(I64Ty);
-      RemainingBytes -= 8;
-    }
+  Type *I64Ty = Type::getInt64Ty(Context);
+  while (RemainingBytes >= 8) {
+    OpsOut.push_back(I64Ty);
+    RemainingBytes -= 8;
+  }
 
-    Type *I32Ty = Type::getInt32Ty(Context);
-    while (RemainingBytes >= 4) {
-      OpsOut.push_back(I32Ty);
-      RemainingBytes -= 4;
-    }
+  Type *I32Ty = Type::getInt32Ty(Context);
+  while (RemainingBytes >= 4) {
+    OpsOut.push_back(I32Ty);
+    RemainingBytes -= 4;
   }
 
   Type *I16Ty = Type::getInt16Ty(Context);
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index ffe9e06c04ae4..5a9f53ec0077d 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -330,17 +330,17 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0,
 
 define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_alt_type(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
 ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT: loop-memcpy-expansion:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1
 ; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT: loop-memcpy-residual:
@@ -681,13 +681,25 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1038
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
+; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP16:%.*]] = load i64, ptr addrspace(1) [[TMP15]], align 2
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i64 [[TMP16]], ptr addrspace(1) [[TMP17]], align 2
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
+; OPT-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
+; OPT-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2
+; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036
+; OPT-NEXT: [[TMP13:%.*]] = load i16, ptr addrspace(1) [[TMP12]], align 2
+; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036
+; OPT-NEXT: store i16 [[TMP13]], ptr addrspace(1) [[TMP14]], align 2
 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
@@ -731,13 +743,17 @@ define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2
 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026
@@ -754,13 +770,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026
@@ -804,13 +824,17 @@ define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspa
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 4
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2
 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -854,13 +878,17 @@ define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspa
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 4
 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -904,13 +932,17 @@ define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspa
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2
 ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
 ; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
 ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -958,17 +990,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs
 
 define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
 ; OPT-LABEL: @memcpy_global_align2_global_align2_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15
 ; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
 ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT: loop-memcpy-expansion:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(1) [[TMP5]], align 2
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 2
 ; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[TMP7]], align 2
-; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 2
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 2
+; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16
 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
 ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT: loop-memcpy-residual:
@@ -1028,17 +1060,17 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs
 
 define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_local_align4_local_align4_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
 ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT: loop-memcpy-expansion:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4
 ; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT: loop-memcpy-residual:
@@ -1063,17 +1095,17 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa
 
 define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_local_align2_local_align2_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
 ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT: loop-memcpy-expansion:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(3) [[TMP5]], align 2
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 2
 ; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(3) [[TMP7]], align 2
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 2
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 2
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT: loop-memcpy-residual:
@@ -1098,17 +1130,17 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa
 
 define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_local_align1_local_align1_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
 ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT: loop-memcpy-expansion:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1
 ; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT: loop-memcpy-residual:
@@ -1133,17 +1165,17 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa
 
 define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrspace(3) %dst, ptr addrspace(1) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_local_align4_global_align4_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
 ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT: loop-memcpy-expansion:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP5]], align 4
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 4
 ; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT: loop-memcpy-residual:
@@ -1168,17 +1200,17 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp
 
 define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_global_align4_local_align4_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
 ; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT: loop-memcpy-expansion:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4
 ; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
 ; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT: loop-memcpy-residual:
@@ -1693,10 +1725,10 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
 ; ALL: load-store-loop:
 ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]]
+; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]]
 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]]
-; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]]
+; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256
 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL: memcpy-split:
@@ -1708,17 +1740,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
 
 define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) {
 ; MAX1024-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
 ; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
 ; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; MAX1024: loop-memcpy-expansion:
 ; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]]
+; MAX1024-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]]
 ; MAX1024-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META0]]
-; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; MAX1024-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META0]]
+; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
 ; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; MAX1024: loop-memcpy-residual:
@@ -1738,17 +1770,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr
 ; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
 ;
 ; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
 ; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
 ; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; ALL: loop-memcpy-expansion:
 ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]]
+; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]]
 ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META9]]
-; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META9]]
+; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
 ; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; ALL: loop-memcpy-residual:
@@ -1781,10 +1813,10 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5)
 ; ALL: load-store-loop:
 ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]]
+; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]]
 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]]
-; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]]
+; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
 ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256
 ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL: memcpy-split:
@@ -1796,17 +1828,17 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5)
 
 define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size) {
 ; MAX1024-LABEL: @memmove_private_align1_local_align1_unknown_size(
-; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
 ; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
 ; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; MAX1024: loop-memcpy-expansion:
 ; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]]
+; MAX1024-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]]
 ; MAX1024-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META3]]
-; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; MAX1024-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META3]]
+; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
 ; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; MAX1024: loop-memcpy-residual:
@@ -1826,17 +1858,17 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr
 ; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
 ;
 ; ALL-LABEL: @memmove_private_align1_local_align1_unknown_size(
-; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
 ; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
 ; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; ALL: loop-memcpy-expansion:
 ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]]
+; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]]
 ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META15]]
-; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META15]]
+; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
 ; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; ALL: loop-memcpy-residual:
@@ -1871,20 +1903,20 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds
 ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL: memmove_bwd_loop:
 ; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 256
 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP3]], align 1
 ; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
 ; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
 ; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL: memmove_fwd_loop:
 ; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
 ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
-; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP6]], align 1
 ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
-; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 256
 ; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256
 ; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL: memmove_done:
@@ -1896,7 +1928,7 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds
 
 define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size) {
 ; OPT-LABEL: @memmove_flat_align1_local_align1_unknown_size(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
 ; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
 ; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
@@ -1918,11 +1950,11 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add
 ; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
 ; OPT: memmove_bwd_main_loop:
 ; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 16
 ; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP10]], align 1
 ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
 ; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
 ; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
 ; OPT: memmove_copy_forward:
@@ -1930,10 +1962,10 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add
 ; OPT: memmove_fwd_main_loop:
 ; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
 ; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP13]], align 1
 ; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
-; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
+; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 16
 ; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]]
 ; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
 ; OPT: memmove_fwd_middle:
@@ -1965,20 +1997,20 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %ds
 ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL: memmove_bwd_loop:
 ; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 256
 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_INDEX]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr [[TMP3]], align 1
 ; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
+; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
 ; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
 ; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL: memmove_fwd_loop:
 ; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
 ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_INDEX]]
-; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr [[TMP6]], align 1
 ; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
-; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 256
 ; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256
 ; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL: memmove_done:
@@ -1990,7 +2022,7 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %ds
 
 define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size) {
 ; OPT-LABEL: @memmove_local_align1_flat_align1_unknown_size(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
 ; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
 ; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
@@ -2012,11 +2044,11 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr add
 ; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
 ; OPT: memmove_bwd_main_loop:
 ; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 16
 ; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP10]], align 1
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1
 ; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
 ; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
 ; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
 ; OPT: memmove_copy_forward:
@@ -2024,10 +2056,10 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr add
 ; OPT: memmove_fwd_main_loop:
 ; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
 ; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr [[TMP13]], align 1
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr [[TMP13]], align 1
 ; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
-; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
+; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 16
 ; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]]
 ; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
 ; OPT: memmove_fwd_middle:
@@ -2058,20 +2090,20 @@ define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %d
 ; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL: memmove_bwd_loop:
 ; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 8
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 256
 ; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP2]], align 1
+; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP2]], align 1
 ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
 ; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
 ; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL: memmove_fwd_loop:
 ; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
 ; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
-; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP5]], align 1
 ; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
-; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 256
 ; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 256
 ; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL: memmove_done:
@@ -2083,7 +2115,7 @@ define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %d
 
 define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size) {
 ; OPT-LABEL: @memmove_local_align1_local_align1_unknown_size(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
 ; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
 ; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
 ; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
@@ -2104,11 +2136,11 @@ define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr ad
 ; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
 ; OPT: memmove_bwd_main_loop:
 ; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 8
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 16
 ; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP9]], align 1
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP9]], align 1
 ; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
 ; OPT-NEXT: [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
 ; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
 ; OPT: memmove_copy_forward:
@@ -2116,10 +2148,10 @@ define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr ad
 ; OPT: memmove_fwd_main_loop:
 ; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
 ; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP12]], align 1
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP12]], align 1
 ; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
-; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 16
 ; OPT-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP3]]
 ; OPT-NEXT: br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
 ; OPT: memmove_fwd_middle:
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
index a68d2e575607d..bc8bcc622810f 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
@@ -306,10 +306,10 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
 ; CHECK-LABEL: memmove_p0_p3:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v7, 15, v3
 ; CHECK-NEXT: v_mov_b32_e32 v8, 0
 ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; CHECK-NEXT: v_and_b32_e32 v5, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v5, -16, v3
 ; CHECK-NEXT: v_mov_b32_e32 v6, v4
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[7:8]
@@ -338,15 +338,15 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: .p2align 6
 ; CHECK-NEXT: .LBB2_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[13:14], v4
-; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -8
+; CHECK-NEXT: ds_read_b128 v[13:16], v4
+; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
-; CHECK-NEXT: v_add_nc_u32_e32 v4, 8, v4
+; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12]
 ; CHECK-NEXT: s_or_b32 s9, s5, s9
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[9:10], v[13:14]
-; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 8
+; CHECK-NEXT: flat_store_dwordx4 v[9:10], v[13:16]
+; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 16
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT: s_cbranch_execnz .LBB2_5
@@ -355,7 +355,7 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: s_and_saveexec_b32 s8, s4
 ; CHECK-NEXT: s_cbranch_execz .LBB2_9
 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v5
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v6, s5
 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
@@ -414,26 +414,26 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
 ; CHECK-NEXT: s_cbranch_execz .LBB2_16
 ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
-; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -8
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
 ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; CHECK-NEXT: v_add3_u32 v2, v3, v2, -8
+; CHECK-NEXT: v_add3_u32 v2, v3, v2, -16
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: .p2align 6
 ; CHECK-NEXT: .LBB2_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[3:4], v2
-; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v5, -8
-; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v6, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v9, vcc_lo, v0, v5
-; CHECK-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v1, v6, vcc_lo
-; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8]
-; CHECK-NEXT: v_mov_b32_e32 v5, v7
-; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2
-; CHECK-NEXT: v_mov_b32_e32 v6, v8
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
+; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v6, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5
+; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v1, v6, vcc_lo
+; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4]
+; CHECK-NEXT: v_mov_b32_e32 v6, v4
+; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
+; CHECK-NEXT: v_mov_b32_e32 v5, v3
 ; CHECK-NEXT: s_or_b32 s7, s4, s7
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[9:10], v[3:4]
+; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT: s_cbranch_execnz .LBB2_15
 ; CHECK-NEXT: .LBB2_16: ; %Flow36
@@ -1043,9 +1043,9 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
 ; CHECK-LABEL: memmove_p1_p3:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
 ; CHECK-NEXT: v_mov_b32_e32 v8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
 ; CHECK-NEXT: v_mov_b32_e32 v6, 0
 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
@@ -1056,16 +1056,16 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: .LBB7_2: ; %loop-memcpy-expansion
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[10:11], v9
-; CHECK-NEXT: v_add_co_u32 v12, vcc_lo, v0, s4
-; CHECK-NEXT: s_add_u32 s4, s4, 8
-; CHECK-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: ds_read_b128 v[10:13], v9
+; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4
+; CHECK-NEXT: s_add_u32 s4, s4, 16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, s5, v1, vcc_lo
 ; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v[12:13], v[10:11], off
+; CHECK-NEXT: global_store_dwordx4 v[14:15], v[10:13], off
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT: s_cbranch_execnz .LBB7_2
 ; CHECK-NEXT: .LBB7_3: ; %Flow9
@@ -1076,7 +1076,7 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
 ; CHECK-NEXT: s_cbranch_execz .LBB7_7
 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
@@ -1327,11 +1327,11 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
 ; CHECK-LABEL: memmove_p3_p0:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
 ; CHECK-NEXT: v_mov_b32_e32 v6, 0
 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
 ; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
 ; CHECK-NEXT: v_mov_b32_e32 v8, v4
 ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[5:6]
 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, vcc_lo
@@ -1361,16 +1361,16 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT: .p2align 6
 ; CHECK-NEXT: .LBB10_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: flat_load_dwordx2 v[13:14], v[9:10]
-; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -8
+; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10]
+; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
-; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 8
+; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, 0, v10, s5
 ; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[11:12]
 ; CHECK-NEXT: s_or_b32 s9, s6, s9
 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v4, v[13:14]
-; CHECK-NEXT: v_add_nc_u32_e32 v4, 8, v4
+; CHECK-NEXT: ds_write_b128 v4, v[13:16]
+; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT: s_cbranch_execnz .LBB10_5
 ; CHECK-NEXT: .LBB10_6: ; %Flow34
@@ -1378,7 +1378,7 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT: s_and_saveexec_b32 s8, s4
 ; CHECK-NEXT: s_cbranch_execz .LBB10_9
 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s9, 0
 ; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3
 ; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7
@@ -1437,23 +1437,23 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
 ; CHECK-NEXT: s_cbranch_execz .LBB10_16
 ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
-; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -8
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
+; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -16
 ; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, -1, v2, vcc_lo
-; CHECK-NEXT: v_add3_u32 v0, v3, v0, -8
+; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16
 ; CHECK-NEXT: s_mov_b32 s5, 0
 ; CHECK-NEXT: .LBB10_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7
 ; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v2, v8, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -8
+; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -16
 ; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v8, vcc_lo
-; CHECK-NEXT: flat_load_dwordx2 v[3:4], v[3:4]
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[3:4]
 ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8]
 ; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v0, v[3:4]
-; CHECK-NEXT: v_add_nc_u32_e32 v0, -8, v0
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: v_add_nc_u32_e32 v0, -16, v0
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT: s_cbranch_execnz .LBB10_15
 ; CHECK-NEXT: .LBB10_16: ; %Flow36
@@ -1470,9 +1470,9 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
 ; CHECK-LABEL: memmove_p3_p1:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
 ; CHECK-NEXT: v_mov_b32_e32 v8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
 ; CHECK-NEXT: v_mov_b32_e32 v6, 0
 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
@@ -1485,14 +1485,14 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
 ; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
-; CHECK-NEXT: s_add_u32 s4, s4, 8
+; CHECK-NEXT: s_add_u32 s4, s4, 16
 ; CHECK-NEXT: s_addc_u32 s5, s5, 0
 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
-; CHECK-NEXT: global_load_dwordx2 v[10:11], v[10:11], off
+; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off
 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v9, v[10:11]
-; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT: ds_write_b128 v9, v[10:13]
+; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT: s_cbranch_execnz .LBB11_2
 ; CHECK-NEXT: .LBB11_3: ; %Flow9
@@ -1503,7 +1503,7 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
 ; CHECK-NEXT: s_cbranch_execz .LBB11_7
 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT: s_mov_b32 s7, 0
 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
@@ -1538,8 +1538,8 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: v_mov_b32_e32 v5, 0
-; CHECK-NEXT: v_and_b32_e32 v4, 7, v2
-; CHECK-NEXT: v_and_b32_e32 v6, -8, v2
+; CHECK-NEXT: v_and_b32_e32 v4, 15, v2
+; CHECK-NEXT: v_and_b32_e32 v6, -16, v2
 ; CHECK-NEXT: v_mov_b32_e32 v7, v3
 ; CHECK-NEXT: s_mov_b32 s6, exec_lo
 ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[4:5]
@@ -1563,15 +1563,15 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: s_mov_b32 s8, 0
 ; CHECK-NEXT: .LBB12_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[9:10], v3
-; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -8
+; CHECK-NEXT: ds_read_b128 v[9:12], v3
+; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -16
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
-; CHECK-NEXT: v_add_nc_u32_e32 v3, 8, v3
+; CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3
 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
 ; CHECK-NEXT: s_or_b32 s8, s5, s8
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v8, v[9:10]
-; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: ds_write_b128 v8, v[9:12]
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT: s_cbranch_execnz .LBB12_5
 ; CHECK-NEXT: .LBB12_6: ; %Flow41
@@ -1579,7 +1579,7 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: s_and_saveexec_b32 s7, s4
 ; CHECK-NEXT: s_cbranch_execz .LBB12_9
 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v2
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v2
 ; CHECK-NEXT: s_mov_b32 s8, 0
 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
@@ -1630,24 +1630,24 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
 ; CHECK-NEXT: s_cbranch_execz .LBB12_16
 ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v5, -8, v2
+; CHECK-NEXT: v_and_b32_e32 v5, -16, v2
 ; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v4, -8, v5
+; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v5
 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v4
 ; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5
 ; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4
 ; CHECK-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
 ; CHECK-NEXT: .LBB12_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[5:6], v4
-; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 8
+; CHECK-NEXT: ds_read_b128 v[5:8], v4
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
 ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; CHECK-NEXT: v_add_nc_u32_e32 v4, -8, v4
+; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v4
 ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
 ; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v2, v[5:6]
-; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2 +; CHECK-NEXT: ds_write_b128 v2, v[5:8] +; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB12_15 ; CHECK-NEXT: .LBB12_16: ; %Flow43 @@ -1664,9 +1664,9 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align ; CHECK-LABEL: memmove_p3_p4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v7, -8, v3 +; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 ; CHECK-NEXT: v_mov_b32_e32 v8, v4 -; CHECK-NEXT: v_and_b32_e32 v5, 7, v3 +; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo @@ -1679,14 +1679,14 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo -; CHECK-NEXT: s_add_u32 s4, s4, 8 +; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] -; CHECK-NEXT: global_load_dwordx2 v[10:11], v[10:11], off +; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b64 v9, v[10:11] -; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9 +; CHECK-NEXT: ds_write_b128 v9, v[10:13] +; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB13_2 ; CHECK-NEXT: .LBB13_3: ; %Flow9 @@ -1697,7 +1697,7 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB13_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: v_and_b32_e32 v3, -8, v3 +; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 @@ -1735,27 +1735,30 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_and_b32_e32 v2, -8, v4 -; CHECK-NEXT: v_and_b32_e32 v5, 7, v4 +; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 +; CHECK-NEXT: v_and_b32_e32 v5, 15, v4 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB14_3 ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v7, v1 ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB14_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_clause 0x3 ; CHECK-NEXT: buffer_load_dword v9, v7, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v10, v7, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_add_u32 s4, s4, 8 +; CHECK-NEXT: buffer_load_dword v11, v7, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v12, v7, s[0:3], 0 offen offset:12 +; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_add_nc_u32_e32 v7, 8, v7 +; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3] ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b64 v8, v[9:10] -; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8 +; 
CHECK-NEXT: ds_write_b128 v8, v[9:12] +; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB14_2 ; CHECK-NEXT: .LBB14_3: ; %Flow14 @@ -1766,7 +1769,7 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB14_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: v_and_b32_e32 v2, -8, v4 +; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 @@ -2021,25 +2024,28 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo -; CHECK-NEXT: v_and_b32_e32 v2, -8, v4 -; CHECK-NEXT: v_and_b32_e32 v5, 7, v4 +; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 +; CHECK-NEXT: v_and_b32_e32 v5, 15, v4 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB17_3 ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v7, v1 ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB17_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ds_read_b64 v[9:10], v7 -; CHECK-NEXT: s_add_u32 s4, s4, 8 +; CHECK-NEXT: ds_read_b128 v[9:12], v7 +; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_add_nc_u32_e32 v7, 8, v7 +; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8 +; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB17_2 @@ -2051,7 +2057,7 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB17_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: v_and_b32_e32 v2, -8, v4 +; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2