@@ -75,6 +75,13 @@ static cl::opt<size_t> InlineMaxBB(
     cl::desc("Maximum number of BBs allowed in a function after inlining"
              " (compile time constraint)"));

+// This default unroll factor is based on microbenchmarks on gfx1030.
+static cl::opt<unsigned> MemcpyLoopUnroll(
+    "amdgpu-memcpy-loop-unroll",
+    cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
+             "operations when lowering memcpy as a loop"),
+    cl::init(16), cl::Hidden);
+
 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                               unsigned Depth = 0) {
   const Instruction *I = dyn_cast<Instruction>(Cond);
@@ -409,13 +416,8 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
   return 1024;
 }

-// FIXME: Really we would like to issue multiple 128-bit loads and stores per
-// iteration. Should we report a larger size and let it legalize?
-//
 // FIXME: Should we use narrower types for local/region, or account for when
 // unaligned access is legal?
-//
-// FIXME: This could use fine tuning and microbenchmarks.
 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
     LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
     unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -442,17 +444,29 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
     return FixedVectorType::get(Type::getInt32Ty(Context), 2);
   }

-  // Global memory works best with 16-byte accesses. Private memory will also
-  // hit this, although they'll be decomposed.
-  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
+  // Global memory works best with 16-byte accesses.
+  // If the operation has a fixed known length that is large enough, it is
+  // worthwhile to return an even wider type and let legalization lower it into
+  // multiple accesses, effectively unrolling the memcpy loop. Private memory
+  // also hits this, although accesses may be decomposed.
+  //
+  // Don't unroll if Length is not a constant, since unrolling leads to worse
+  // performance for length values that are smaller or slightly larger than the
+  // total size of the type returned here. Mitigating that would require a more
+  // complex lowering for variable-length memcpy and memmove.
+  unsigned I32EltsInVector = 4;
+  if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Length))
+    return FixedVectorType::get(Type::getInt32Ty(Context),
+                                MemcpyLoopUnroll * I32EltsInVector);
+
+  return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
 }

 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
     SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
     unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
     Align SrcAlign, Align DestAlign,
     std::optional<uint32_t> AtomicCpySize) const {
-  assert(RemainingBytes < 16);

   if (AtomicCpySize)
     BaseT::getMemcpyLoopResidualLoweringType(
@@ -462,6 +476,12 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
   Align MinAlign = std::min(SrcAlign, DestAlign);

   if (MinAlign != Align(2)) {
+    Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+    while (RemainingBytes >= 16) {
+      OpsOut.push_back(I32x4Ty);
+      RemainingBytes -= 16;
+    }
+
     Type *I64Ty = Type::getInt64Ty(Context);
     while (RemainingBytes >= 8) {
       OpsOut.push_back(I64Ty);
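
Note on the combined effect of the hunks above: the main copy loop now moves MemcpyLoopUnroll * 16 bytes per iteration (256 with the default), so the residual handled by getMemcpyLoopResidualLoweringType can be up to 255 bytes, which is why the assert(RemainingBytes < 16) is dropped and <4 x i32> chunks are emitted first. The standalone C++ sketch below mirrors that greedy decomposition; it is illustrative only, decomposeResidual is an invented helper, and the i32/i16/i8 tail is assumed from the pre-existing code that follows the last hunk rather than shown in this diff.

// Illustrative sketch, not part of the patch: mimics the greedy residual
// decomposition for the non-2-byte-aligned case.
#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> decomposeResidual(unsigned RemainingBytes) {
  std::vector<std::string> Ops;
  // New in this patch: cover the residual with 16-byte <4 x i32> chunks first.
  while (RemainingBytes >= 16) { Ops.push_back("<4 x i32>"); RemainingBytes -= 16; }
  while (RemainingBytes >= 8)  { Ops.push_back("i64"); RemainingBytes -= 8; }
  // Assumed continuation (i32/i16/i8), matching the existing code below the hunk.
  while (RemainingBytes >= 4)  { Ops.push_back("i32"); RemainingBytes -= 4; }
  while (RemainingBytes >= 2)  { Ops.push_back("i16"); RemainingBytes -= 2; }
  while (RemainingBytes >= 1)  { Ops.push_back("i8");  RemainingBytes -= 1; }
  return Ops;
}

int main() {
  // A 255-byte residual (the maximum once the main loop copies 256 bytes per
  // iteration) becomes 15 x <4 x i32>, 1 x i64, 1 x i32, 1 x i16, 1 x i8.
  for (const std::string &Op : decomposeResidual(255))
    std::printf("%s\n", Op.c_str());
  return 0;
}

Since MemcpyLoopUnroll is a hidden cl::opt, it should be adjustable through the usual plumbing, e.g. -mllvm -amdgpu-memcpy-loop-unroll=8 when invoking clang; that invocation is an assumption about standard cl::opt handling, not something added by this patch.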