Commit aa9a039

[AMDGPU] Use wider loop lowering type for LowerMemIntrinsics
When llvm.memcpy or llvm.memmove intrinsics are lowered as a loop in LowerMemIntrinsics.cpp, the loop consists of a single load/store pair per iteration. We can improve performance in some cases by emitting multiple load/store pairs per iteration. This patch achieves that by increasing the width of the loop lowering type in the GCN target and letting legalization split the resulting too-wide access pairs into multiple legal access pairs.

This change only affects lowered memcpys and memmoves with large (>= 1024 bytes) constant lengths. Smaller constant lengths are handled by ISel directly; non-constant lengths would be slowed down by this change if the dynamic length was smaller or slightly larger than what an unrolled iteration copies. The chosen default unroll factor is the result of microbenchmarks on gfx1030. This change leads to speedups of 15-38% for global memory and 1.9-5.8x for scratch in these microbenchmarks.

Part of SWDEV-455845.
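For a rough sense of the effect with the defaults, here is a back-of-the-envelope sketch; the constant names below are illustrative, not identifiers from the patch:

// Back-of-the-envelope sketch of the default configuration.
constexpr unsigned I32EltsPerVector = 4;  // base lowering type is <4 x i32>
constexpr unsigned UnrollFactor = 16;     // default of amdgpu-memcpy-loop-unroll
constexpr unsigned BytesPerIteration = I32EltsPerVector * 4 * UnrollFactor; // 256
// A 1024-byte memcpy therefore lowers to a 4-iteration loop over <64 x i32>,
// and legalization splits each iteration into sixteen 128-bit load/store pairs.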
1 parent ac5a201 commit aa9a039

File tree: 4 files changed, +2748 −48 lines

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 46 additions & 9 deletions
@@ -75,6 +75,13 @@ static cl::opt<size_t> InlineMaxBB(
     cl::desc("Maximum number of BBs allowed in a function after inlining"
              " (compile time constraint)"));
 
+// This default unroll factor is based on microbenchmarks on gfx1030.
+static cl::opt<unsigned> MemcpyLoopUnroll(
+    "amdgpu-memcpy-loop-unroll",
+    cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
+             "operations when lowering memcpy as a loop, must be a power of 2"),
+    cl::init(16), cl::Hidden);
+
 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                               unsigned Depth = 0) {
   const Instruction *I = dyn_cast<Instruction>(Cond);
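Since MemcpyLoopUnroll is a hidden cl::opt, it can presumably be overridden for experiments in the usual way for such options (e.g. -amdgpu-memcpy-loop-unroll=<N> when running opt or llc directly, or via -mllvm from clang); per the guard added to getMemcpyLoopLoweringType below, a value of 0 disables the widening altogether.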
@@ -409,13 +416,8 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
   return 1024;
 }
 
-// FIXME: Really we would like to issue multiple 128-bit loads and stores per
-// iteration. Should we report a larger size and let it legalize?
-//
 // FIXME: Should we use narrower types for local/region, or account for when
 // unaligned access is legal?
-//
-// FIXME: This could use fine tuning and microbenchmarks.
 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
     LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
     unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -442,17 +444,46 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
     return FixedVectorType::get(Type::getInt32Ty(Context), 2);
   }
 
-  // Global memory works best with 16-byte accesses. Private memory will also
-  // hit this, although they'll be decomposed.
-  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
+  // Global memory works best with 16-byte accesses.
+  // If the operation has a fixed known length that is large enough, it is
+  // worthwhile to return an even wider type and let legalization lower it into
+  // multiple accesses, effectively unrolling the memcpy loop. Private memory
+  // also hits this, although accesses may be decomposed.
+  //
+  // Don't unroll if
+  // - Length is not a constant, since unrolling leads to worse performance for
+  //   length values that are smaller or slightly larger than the total size of
+  //   the type returned here. Mitigating that would require a more complex
+  //   lowering for variable-length memcpy and memmove.
+  // - the memory operations would be split further into byte-wise accesses
+  //   because of their (mis)alignment, since that would lead to a huge code
+  //   size increase.
+  unsigned I32EltsInVector = 4;
+  if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Length)) {
+    unsigned VectorSizeBytes = I32EltsInVector * 4;
+    unsigned VectorSizeBits = VectorSizeBytes * 8;
+    unsigned UnrolledVectorBytes = VectorSizeBytes * MemcpyLoopUnroll;
+    Align PartSrcAlign(commonAlignment(SrcAlign, UnrolledVectorBytes));
+    Align PartDestAlign(commonAlignment(DestAlign, UnrolledVectorBytes));
+
+    const SITargetLowering *TLI = this->getTLI();
+    bool SrcNotSplit = TLI->allowsMisalignedMemoryAccessesImpl(
+        VectorSizeBits, SrcAddrSpace, PartSrcAlign);
+    bool DestNotSplit = TLI->allowsMisalignedMemoryAccessesImpl(
+        VectorSizeBits, DestAddrSpace, PartDestAlign);
+    if (SrcNotSplit && DestNotSplit)
+      return FixedVectorType::get(Type::getInt32Ty(Context),
+                                  MemcpyLoopUnroll * I32EltsInVector);
+  }
+
+  return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
 }
 
 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
     SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
     unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
     Align SrcAlign, Align DestAlign,
     std::optional<uint32_t> AtomicCpySize) const {
-  assert(RemainingBytes < 16);
 
   if (AtomicCpySize)
     BaseT::getMemcpyLoopResidualLoweringType(
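The unroll decision above hinges on whether a 128-bit access at the alignment guaranteed for every unrolled chunk would stay legal without being broken into byte accesses. A minimal sketch of that alignment computation with the default unroll factor; the standalone form and the concrete numbers are assumptions for illustration, while commonAlignment and Align are the real helpers from llvm/Support/Alignment.h:

#include "llvm/Support/Alignment.h"
#include <cstdint>
using namespace llvm;

int main() {
  // Default configuration: 4 x i32 (16 bytes) unrolled 16x = 256-byte stride.
  const uint64_t UnrolledVectorBytes = 16 * 16;
  Align SrcAlign(4); // e.g. an align-4 global pointer, as in the tests below
  Align PartSrcAlign = commonAlignment(SrcAlign, UnrolledVectorBytes);
  // commonAlignment(Align(4), 256) == Align(4): every 256-byte step preserves
  // 4-byte alignment, and (per the tests below) a 4-byte-aligned 128-bit
  // global access is not split into byte accesses, so <64 x i32> is returned.
  return PartSrcAlign == Align(4) ? 0 : 1;
}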
@@ -462,6 +493,12 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
   Align MinAlign = std::min(SrcAlign, DestAlign);
 
   if (MinAlign != Align(2)) {
+    Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+    while (RemainingBytes >= 16) {
+      OpsOut.push_back(I32x4Ty);
+      RemainingBytes -= 16;
+    }
+
     Type *I64Ty = Type::getInt64Ty(Context);
     while (RemainingBytes >= 8) {
       OpsOut.push_back(I64Ty);
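With the RemainingBytes < 16 assert removed above, the residual after the unrolled loop can now be up to one unrolled stride minus one byte, and the new 16-byte chunk loop consumes most of it. A minimal sketch of the decomposition for an assumed residual of 40 bytes (all numbers chosen for illustration, not from the patch):

#include <vector>

int main() {
  // Hypothetical residual: a 1064-byte copy leaves 1064 - 4 * 256 = 40 bytes
  // after the unrolled loop.
  unsigned RemainingBytes = 40;
  std::vector<const char *> Ops;
  // The two fallback loops shown in the hunk above:
  while (RemainingBytes >= 16) { Ops.push_back("<4 x i32>"); RemainingBytes -= 16; }
  while (RemainingBytes >= 8)  { Ops.push_back("i64");       RemainingBytes -= 8;  }
  // Ops == {"<4 x i32>", "<4 x i32>", "i64"}; any last few bytes would be
  // handled by narrower types not shown in this hunk.
  return Ops.size() == 3 ? 0 : 1;
}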

llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp

Lines changed: 12 additions & 0 deletions
@@ -48,6 +48,9 @@ void llvm::createMemCpyLoopKnownSize(
       Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
   assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
          "Atomic memcpy lowering is not supported for vector operand type");
+  assert((DL.getTypeStoreSize(LoopOpType) == DL.getTypeAllocSize(LoopOpType)) &&
+         "Bytes are missed if store and alloc size of the LoopOpType do not "
+         "match");
 
   Type *Int8Type = Type::getInt8Ty(Ctx);
   unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
@@ -200,6 +203,9 @@ void llvm::createMemCpyLoopUnknownSize(
       Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
   assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
          "Atomic memcpy lowering is not supported for vector operand type");
+  assert((DL.getTypeStoreSize(LoopOpType) == DL.getTypeAllocSize(LoopOpType)) &&
+         "Bytes are missed if store and alloc size of the LoopOpType do not "
+         "match");
   unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
   assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
          "Atomic memcpy lowering is not supported for selected operand size");
@@ -414,6 +420,9 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
 
   Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
                                                    SrcAlign, DstAlign);
+  assert((DL.getTypeStoreSize(LoopOpType) == DL.getTypeAllocSize(LoopOpType)) &&
+         "Bytes are missed if store and alloc size of the LoopOpType do not "
+         "match");
   unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
   Type *Int8Type = Type::getInt8Ty(Ctx);
   bool LoopOpIsInt8 = LoopOpType == Int8Type;
@@ -672,6 +681,9 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
 
   Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
                                                    SrcAlign, DstAlign);
+  assert((DL.getTypeStoreSize(LoopOpType) == DL.getTypeAllocSize(LoopOpType)) &&
+         "Bytes are missed if store and alloc size of the LoopOpType do not "
+         "match");
   unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
   Type *Int8Type = Type::getInt8Ty(Ctx);
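These asserts guard against loop types whose store size and alloc size disagree, since the lowering computes its byte counts from the store size. A minimal standalone sketch of such a type; the empty datalayout string and the choice of <3 x i32> are illustrative assumptions, not something the patch uses:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  DataLayout DL("");  // default datalayout rules; real targets differ
  Type *V3I32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 3);
  // <3 x i32> stores 12 bytes, but its alloc size is typically rounded up to
  // its ABI alignment (16 under these default rules), so the two sizes differ.
  outs() << "store size: " << DL.getTypeStoreSize(V3I32).getFixedValue()
         << ", alloc size: " << DL.getTypeAllocSize(V3I32).getFixedValue() << "\n";
}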

llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll

Lines changed: 32 additions & 39 deletions
@@ -456,10 +456,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(ptr addrspace
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
@@ -479,10 +479,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(ptr addrspace
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
@@ -502,10 +502,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(ptr addrspace
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
@@ -525,10 +525,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(ptr addrspace
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
@@ -548,10 +548,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(ptr addrspace
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
@@ -575,10 +575,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(ptr addrspace
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
@@ -606,10 +606,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(ptr addrspace
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
@@ -633,10 +633,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(ptr addrspace
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
@@ -691,10 +691,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(ptr addrspace
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
@@ -764,10 +764,10 @@ define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(ptr addrspa
 ; OPT: load-store-loop:
 ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 4
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4
 ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 16
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
 ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT: memcpy-split:
@@ -1194,17 +1194,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_16(ptr addrspace(1
 ; MAX1024-NEXT: ret void
 ;
 ; ALL-LABEL: @memcpy_global_align4_global_align4_16(
-; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
-; ALL: load-store-loop:
-; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
 ; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
-; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
 ; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
-; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
-; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
-; ALL: memcpy-split:
 ; ALL-NEXT: ret void
 ;
 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 16, i1 false)
