4545//
4646// This pass proceeds in three main phases:
4747//
48- // ## Rewriting loads and stores of p7
48+ // ## Rewriting loads and stores of p7 and memcpy()-like handling
4949//
5050// The first phase is to rewrite away all loads and stors of `ptr addrspace(7)`,
5151// including aggregates containing such pointers, to ones that use `i160`. This
52- // is handled by `StoreFatPtrsAsIntsVisitor ` , which visits loads, stores, and
53- // allocas and, if the loaded or stored type contains `ptr addrspace(7)`,
54- // rewrites that type to one where the p7s are replaced by i160s, copying other
55- // parts of aggregates as needed. In the case of a store, each pointer is
56- // `ptrtoint`d to i160 before storing, and load integers are `inttoptr`d back.
57- // This same transformation is applied to vectors of pointers.
52+ // is handled by `StoreFatPtrsAsIntsAndExpandMemcpyVisitor ` , which visits
53+ // loads, stores, and allocas and, if the loaded or stored type contains `ptr
54+ // addrspace(7)`, rewrites that type to one where the p7s are replaced by i160s,
55+ // copying other parts of aggregates as needed. In the case of a store, each
56+ // pointer is `ptrtoint`d to i160 before storing, and load integers are
57+ // `inttoptr`d back. This same transformation is applied to vectors of pointers.
5858//
5959// Such a transformation allows the later phases of the pass to not need
6060// to handle buffer fat pointers moving to and from memory, where we load
6666// Atomics operations on `ptr addrspace(7)` values are not suppported, as the
6767// hardware does not include a 160-bit atomic.
6868//
69+ // In order to save on O(N) work and to ensure that the contents type
70+ // legalizer correctly splits up wide loads, also unconditionally lower
71+ // memcpy-like intrinsics into loops here.
72+ //
6973// ## Buffer contents type legalization
7074//
7175// The underlying buffer intrinsics only support types up to 128 bits long,
231235#include " llvm/IR/InstIterator.h"
232236#include " llvm/IR/InstVisitor.h"
233237#include " llvm/IR/Instructions.h"
238+ #include " llvm/IR/IntrinsicInst.h"
234239#include " llvm/IR/Intrinsics.h"
235240#include " llvm/IR/IntrinsicsAMDGPU.h"
236241#include " llvm/IR/Metadata.h"
237242#include " llvm/IR/Operator.h"
238243#include " llvm/IR/PatternMatch.h"
239244#include " llvm/IR/ReplaceConstant.h"
245+ #include " llvm/IR/ValueHandle.h"
240246#include " llvm/InitializePasses.h"
241247#include " llvm/Pass.h"
248+ #include " llvm/Support/AMDGPUAddrSpace.h"
242249#include " llvm/Support/Alignment.h"
243250#include " llvm/Support/AtomicOrdering.h"
244251#include " llvm/Support/Debug.h"
245252#include " llvm/Support/ErrorHandling.h"
246253#include " llvm/Transforms/Utils/Cloning.h"
247254#include " llvm/Transforms/Utils/Local.h"
255+ #include " llvm/Transforms/Utils/LowerMemIntrinsics.h"
248256#include " llvm/Transforms/Utils/ValueMapper.h"
249257
250258#define DEBUG_TYPE " amdgpu-lower-buffer-fat-pointers"
@@ -431,14 +439,16 @@ namespace {
431439// / marshalling costs when reading or storing these values, but since placing
432440// / such pointers into memory is an uncommon operation at best, we feel that
433441// / this cost is acceptable for better performance in the common case.
434- class StoreFatPtrsAsIntsVisitor
435- : public InstVisitor<StoreFatPtrsAsIntsVisitor , bool > {
442+ class StoreFatPtrsAsIntsAndExpandMemcpyVisitor
443+ : public InstVisitor<StoreFatPtrsAsIntsAndExpandMemcpyVisitor , bool > {
436444 BufferFatPtrToIntTypeMap *TypeMap;
437445
438446 ValueToValueMapTy ConvertedForStore;
439447
440448 IRBuilder<> IRB;
441449
450+ const TargetMachine *TM;
451+
442452 // Convert all the buffer fat pointers within the input value to inttegers
443453 // so that it can be stored in memory.
444454 Value *fatPtrsToInts (Value *V, Type *From, Type *To, const Twine &Name);
@@ -448,20 +458,27 @@ class StoreFatPtrsAsIntsVisitor
448458 Value *intsToFatPtrs (Value *V, Type *From, Type *To, const Twine &Name);
449459
450460public:
451- StoreFatPtrsAsIntsVisitor (BufferFatPtrToIntTypeMap *TypeMap, LLVMContext &Ctx)
452- : TypeMap(TypeMap), IRB(Ctx) {}
461+ StoreFatPtrsAsIntsAndExpandMemcpyVisitor (BufferFatPtrToIntTypeMap *TypeMap,
462+ LLVMContext &Ctx,
463+ const TargetMachine *TM)
464+ : TypeMap(TypeMap), IRB(Ctx), TM(TM) {}
453465 bool processFunction (Function &F);
454466
455467 bool visitInstruction (Instruction &I) { return false ; }
456468 bool visitAllocaInst (AllocaInst &I);
457469 bool visitLoadInst (LoadInst &LI);
458470 bool visitStoreInst (StoreInst &SI);
459471 bool visitGetElementPtrInst (GetElementPtrInst &I);
472+
473+ bool visitMemCpyInst (MemCpyInst &MCI);
474+ bool visitMemMoveInst (MemMoveInst &MMI);
475+ bool visitMemSetInst (MemSetInst &MSI);
476+ bool visitMemSetPatternInst (MemSetPatternInst &MSPI);
460477};
461478} // namespace
462479
463- Value *StoreFatPtrsAsIntsVisitor ::fatPtrsToInts (Value *V, Type *From, Type *To,
464- const Twine &Name) {
480+ Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor ::fatPtrsToInts (
481+ Value *V, Type *From, Type *To, const Twine &Name) {
465482 if (From == To)
466483 return V;
467484 ValueToValueMapTy::iterator Find = ConvertedForStore.find (V);
@@ -498,8 +515,8 @@ Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
498515 return Ret;
499516}
500517
501- Value *StoreFatPtrsAsIntsVisitor ::intsToFatPtrs (Value *V, Type *From, Type *To,
502- const Twine &Name) {
518+ Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor ::intsToFatPtrs (
519+ Value *V, Type *From, Type *To, const Twine &Name) {
503520 if (From == To)
504521 return V;
505522 if (isBufferFatPtrOrVector (To)) {
@@ -531,18 +548,25 @@ Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
531548 return Ret;
532549}
533550
534- bool StoreFatPtrsAsIntsVisitor ::processFunction (Function &F) {
551+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor ::processFunction (Function &F) {
535552 bool Changed = false ;
536- // The visitors will mutate GEPs and allocas, but will push loads and stores
537- // to the worklist to avoid invalidation.
553+ // Process memcpy-like instructions after the main iteration because they can
554+ // invalidate iterators.
555+ SmallVector<WeakTrackingVH> CanBecomeLoops;
538556 for (Instruction &I : make_early_inc_range (instructions (F))) {
539- Changed |= visit (I);
557+ if (isa<MemTransferInst, MemSetInst, MemSetPatternInst>(I))
558+ CanBecomeLoops.push_back (&I);
559+ else
560+ Changed |= visit (I);
561+ }
562+ for (WeakTrackingVH VH : make_early_inc_range (CanBecomeLoops)) {
563+ Changed |= visit (cast<Instruction>(VH));
540564 }
541565 ConvertedForStore.clear ();
542566 return Changed;
543567}
544568
545- bool StoreFatPtrsAsIntsVisitor ::visitAllocaInst (AllocaInst &I) {
569+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor ::visitAllocaInst (AllocaInst &I) {
546570 Type *Ty = I.getAllocatedType ();
547571 Type *NewTy = TypeMap->remapType (Ty);
548572 if (Ty == NewTy)
@@ -551,7 +575,8 @@ bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
551575 return true ;
552576}
553577
554- bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst (GetElementPtrInst &I) {
578+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitGetElementPtrInst (
579+ GetElementPtrInst &I) {
555580 Type *Ty = I.getSourceElementType ();
556581 Type *NewTy = TypeMap->remapType (Ty);
557582 if (Ty == NewTy)
@@ -563,7 +588,7 @@ bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
563588 return true ;
564589}
565590
566- bool StoreFatPtrsAsIntsVisitor ::visitLoadInst (LoadInst &LI) {
591+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor ::visitLoadInst (LoadInst &LI) {
567592 Type *Ty = LI.getType ();
568593 Type *IntTy = TypeMap->remapType (Ty);
569594 if (Ty == IntTy)
@@ -581,7 +606,7 @@ bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
581606 return true ;
582607}
583608
584- bool StoreFatPtrsAsIntsVisitor ::visitStoreInst (StoreInst &SI) {
609+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor ::visitStoreInst (StoreInst &SI) {
585610 Value *V = SI.getValueOperand ();
586611 Type *Ty = V->getType ();
587612 Type *IntTy = TypeMap->remapType (Ty);
@@ -597,6 +622,47 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
597622 return true ;
598623}
599624
625+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemCpyInst (
626+ MemCpyInst &MCI) {
627+ // TODO: Allow memcpy.p7.p3 as a synonym for the direct-to-LDS copy, which'll
628+ // need loop expansion here.
629+ if (MCI.getSourceAddressSpace () != AMDGPUAS::BUFFER_FAT_POINTER &&
630+ MCI.getDestAddressSpace () != AMDGPUAS::BUFFER_FAT_POINTER)
631+ return false ;
632+ llvm::expandMemCpyAsLoop (&MCI,
633+ TM->getTargetTransformInfo (*MCI.getFunction ()));
634+ MCI.eraseFromParent ();
635+ return true ;
636+ }
637+
638+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemMoveInst (
639+ MemMoveInst &MMI) {
640+ if (MMI.getSourceAddressSpace () != AMDGPUAS::BUFFER_FAT_POINTER &&
641+ MMI.getDestAddressSpace () != AMDGPUAS::BUFFER_FAT_POINTER)
642+ return false ;
643+ report_fatal_error (
644+ " memmove() on buffer descriptors is not implemented because pointer "
645+ " comparison on buffer descriptors isn't implemented\n " );
646+ }
647+
648+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst (
649+ MemSetInst &MSI) {
650+ if (MSI.getDestAddressSpace () != AMDGPUAS::BUFFER_FAT_POINTER)
651+ return false ;
652+ llvm::expandMemSetAsLoop (&MSI);
653+ MSI.eraseFromParent ();
654+ return true ;
655+ }
656+
657+ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetPatternInst (
658+ MemSetPatternInst &MSPI) {
659+ if (MSPI.getDestAddressSpace () != AMDGPUAS::BUFFER_FAT_POINTER)
660+ return false ;
661+ llvm::expandMemSetPatternAsLoop (&MSPI);
662+ MSPI.eraseFromParent ();
663+ return true ;
664+ }
665+
600666namespace {
601667// / Convert loads/stores of types that the buffer intrinsics can't handle into
602668// / one ore more such loads/stores that consist of legal types.
@@ -1127,6 +1193,7 @@ bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
11271193
11281194bool LegalizeBufferContentTypesVisitor::processFunction (Function &F) {
11291195 bool Changed = false ;
1196+ // Note, memory transfer intrinsics won't
11301197 for (Instruction &I : make_early_inc_range (instructions (F))) {
11311198 Changed |= visit (I);
11321199 }
@@ -2084,6 +2151,12 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
20842151 case Intrinsic::invariant_end:
20852152 case Intrinsic::launder_invariant_group:
20862153 case Intrinsic::strip_invariant_group:
2154+ case Intrinsic::memcpy:
2155+ case Intrinsic::memcpy_inline:
2156+ case Intrinsic::memmove:
2157+ case Intrinsic::memset:
2158+ case Intrinsic::memset_inline:
2159+ case Intrinsic::experimental_memset_pattern:
20872160 return true ;
20882161 }
20892162}
@@ -2353,7 +2426,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
23532426 /* RemoveDeadConstants=*/ false , /* IncludeSelf=*/ true );
23542427 }
23552428
2356- StoreFatPtrsAsIntsVisitor MemOpsRewrite (&IntTM, M.getContext ());
2429+ StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite (&IntTM, M.getContext (),
2430+ &TM);
23572431 LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite (DL,
23582432 M.getContext ());
23592433 for (Function &F : M.functions ()) {
0 commit comments