diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 28016b5936ccf..3b9f74884aa83 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -676,7 +676,17 @@ static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy, // // We could handle more complicated cases, but it'd make things a lot more // complicated. - if (isa<FixedVectorType>(AccessTy)) { + + // If both are pointer types, verify if they are compatible to copy across + // address spaces. + bool canCopyAcrossAddressSpaces = true; + if (AccessTy->isPtrOrPtrVectorTy() && VecTy->isPtrOrPtrVectorTy()) { + if (DL.getPointerSize(AccessTy->getPointerAddressSpace()) != + DL.getPointerSize(VecTy->getPointerAddressSpace())) + canCopyAcrossAddressSpaces = false; + } + + if (isa<FixedVectorType>(AccessTy) && canCopyAcrossAddressSpaces) { TypeSize AccTS = DL.getTypeStoreSize(AccessTy); TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType()); return AccTS.isKnownMultipleOf(VecTS); @@ -800,11 +810,15 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { Ptr = Ptr->stripPointerCasts(); - // Alloca already accessed as vector. - if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) == - DL->getTypeStoreSize(AccessTy)) { - WorkList.push_back(Inst); - continue; + // Since the size of a pointer is different across address spaces, let the + // isSupportedAccessType() handle the pointer load and store accesses. + if (!AccessTy->isPtrOrPtrVectorTy() || !VectorTy->isPtrOrPtrVectorTy()) { + // Alloca already accessed as vector.
+ if (Ptr == &Alloca && + DL->getTypeStoreSize(AllocaTy) == DL->getTypeStoreSize(AccessTy)) { + WorkList.push_back(Inst); + continue; + } + } if (!isSupportedAccessType(VectorTy, AccessTy, *DL)) diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll index 1e49500a243e1..9dde947b0b0c6 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll @@ -198,3 +198,33 @@ entry: %tmp = load ptr addrspace(3), ptr addrspace(5) %alloca, align 8 ret ptr addrspace(3) %tmp } + +; Will not vectorize because we're saving a 64-bit pointer from addrspace 0 +; into two 32-bit pointers of addrspace 5. +; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_addrspace_ptrvec +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca <2 x ptr addrspace(5)>, align 8, addrspace(5) +; CHECK-NEXT: store ptr undef, ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: ret void +; +define void @alloca_load_store_ptr_mixed_addrspace_ptrvec() { +entry: + %A2 = alloca <2 x ptr addrspace(5)>, align 8, addrspace(5) + store ptr undef, ptr addrspace(5) %A2, align 8 + ret void +} + +; Will not vectorize because we're saving 32-bit pointers from addrspace 5 +; into two 64-bit pointers of addrspace 0, even though the size in memory +; is the same. +; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_addrspace_ptrvec2 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca <2 x ptr>, align 8 +; CHECK-NEXT: store <4 x ptr addrspace(5)> undef, ptr [[ALLOCA]], align 8 +; CHECK-NEXT: ret void +define void @alloca_load_store_ptr_mixed_addrspace_ptrvec2() { +entry: + %A2 = alloca <2 x ptr>, align 8 + store <4 x ptr addrspace(5)> undef, ptr %A2, align 8 + ret void +}