@@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
   return 1024;
 }
 
-// FIXME: Should we use narrower types for local/region, or account for when
-// unaligned access is legal?
 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
     LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
     unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -426,29 +424,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
   if (AtomicElementSize)
     return Type::getIntNTy(Context, *AtomicElementSize * 8);
 
-  Align MinAlign = std::min(SrcAlign, DestAlign);
-
-  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
-  // hardware into byte accesses. If you assume all alignments are equally
-  // probable, it's more efficient on average to use short accesses for this
-  // case.
-  if (MinAlign == Align(2))
-    return Type::getInt16Ty(Context);
-
-  // Not all subtargets have 128-bit DS instructions, and we currently don't
-  // form them by default.
-  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
-      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
-    return FixedVectorType::get(Type::getInt32Ty(Context), 2);
-  }
-
-  // Global memory works best with 16-byte accesses.
+  // 16-byte accesses achieve the highest copy throughput.
   // If the operation has a fixed known length that is large enough, it is
   // worthwhile to return an even wider type and let legalization lower it into
-  // multiple accesses, effectively unrolling the memcpy loop. Private memory
-  // also hits this, although accesses may be decomposed.
+  // multiple accesses, effectively unrolling the memcpy loop.
+  // We also rely on legalization to decompose into smaller accesses for
+  // subtargets and address spaces where it is necessary.
   //
   // Don't unroll if Length is not a constant, since unrolling leads to worse
   // performance for length values that are smaller or slightly larger than the
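In effect, the new policy is: always start from a 16-byte unit, widen only when the total length is a compile-time constant, and never unroll for a dynamic length. A minimal standalone sketch of that policy, assuming a hypothetical helper name and an illustrative unroll cap (neither is taken from this patch):

#include <algorithm>
#include <cstdint>
#include <optional>

// pickMemcpyLoopWidthBytes is a hypothetical illustration, not LLVM code.
// It mirrors the comment above: 16 bytes by default, wider only when the
// copy length is a known constant, so that legalization can split the wide
// access back into multiple 16-byte ones (an unrolled memcpy loop).
unsigned pickMemcpyLoopWidthBytes(std::optional<uint64_t> ConstLength) {
  const unsigned UnitBytes = 16; // <4 x i32>, the highest-throughput unit
  if (!ConstLength)
    return UnitBytes; // dynamic length: keep the plain one-unit loop body
  const uint64_t MaxUnroll = 4; // illustrative cap, not from the patch
  uint64_t Units =
      std::clamp<uint64_t>(*ConstLength / UnitBytes, 1, MaxUnroll);
  return UnitBytes * static_cast<unsigned>(Units);
}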
@@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
         OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
         DestAlign, AtomicCpySize);
 
-  Align MinAlign = std::min(SrcAlign, DestAlign);
-
-  if (MinAlign != Align(2)) {
-    Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
-    while (RemainingBytes >= 16) {
-      OpsOut.push_back(I32x4Ty);
-      RemainingBytes -= 16;
-    }
+  Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+  while (RemainingBytes >= 16) {
+    OpsOut.push_back(I32x4Ty);
+    RemainingBytes -= 16;
+  }
 
-    Type *I64Ty = Type::getInt64Ty(Context);
-    while (RemainingBytes >= 8) {
-      OpsOut.push_back(I64Ty);
-      RemainingBytes -= 8;
-    }
+  Type *I64Ty = Type::getInt64Ty(Context);
+  while (RemainingBytes >= 8) {
+    OpsOut.push_back(I64Ty);
+    RemainingBytes -= 8;
+  }
 
-    Type *I32Ty = Type::getInt32Ty(Context);
-    while (RemainingBytes >= 4) {
-      OpsOut.push_back(I32Ty);
-      RemainingBytes -= 4;
-    }
+  Type *I32Ty = Type::getInt32Ty(Context);
+  while (RemainingBytes >= 4) {
+    OpsOut.push_back(I32Ty);
+    RemainingBytes -= 4;
   }
 
   Type *I16Ty = Type::getInt16Ty(Context);
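With the MinAlign != Align(2) guard gone, the residual lowering is a plain greedy decomposition: cover the leftover bytes with the widest unit first, down to a single byte. A minimal standalone sketch of the resulting behavior (the i16/i8 tail is assumed to continue the same pattern, since the hunk ends before it):

#include <cstdint>
#include <string>
#include <vector>

// Illustration of the residual decomposition policy, not the LLVM code.
std::vector<std::string> lowerResidual(uint64_t RemainingBytes) {
  std::vector<std::string> Ops;
  while (RemainingBytes >= 16) { Ops.push_back("<4 x i32>"); RemainingBytes -= 16; }
  while (RemainingBytes >= 8) { Ops.push_back("i64"); RemainingBytes -= 8; }
  while (RemainingBytes >= 4) { Ops.push_back("i32"); RemainingBytes -= 4; }
  while (RemainingBytes >= 2) { Ops.push_back("i16"); RemainingBytes -= 2; }
  if (RemainingBytes == 1) Ops.push_back("i8");
  return Ops;
}

// For example, lowerResidual(23) yields {"<4 x i32>", "i32", "i16", "i8"}
// (16 + 4 + 2 + 1 bytes), now independent of the source and destination
// alignment that the removed MinAlign == Align(2) special case checked.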