diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 94ecb6ba9a2b8..6c01f6dd370f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -729,6 +729,11 @@ static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
   // complicated.
   if (isa<FixedVectorType>(AccessTy)) {
     TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
+    // If the type size and the store size don't match, we would need to do
+    // more than just bitcast to translate between an extracted/insertable
+    // subvector and the accessed value.
+    if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
+      return false;
     TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
     return AccTS.isKnownMultipleOf(VecTS);
   }
@@ -813,15 +818,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
     if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
       unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
-      unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
-      // Expand vector if required to match padding of inner type,
-      // i.e. odd size subvectors.
-      // Storage size of new vector must match that of alloca for correct
-      // behaviour of byte offsets and GEP computation.
-      if (NumElems * ElementSize != AllocaSize)
-        NumElems = AllocaSize / ElementSize;
-      if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
-        VectorTy = FixedVectorType::get(ElemTy, NumElems);
+      if (ElementSize > 0) {
+        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+        // Expand vector if required to match padding of inner type,
+        // i.e. odd size subvectors.
+        // Storage size of new vector must match that of alloca for correct
+        // behaviour of byte offsets and GEP computation.
+        if (NumElems * ElementSize != AllocaSize)
+          NumElems = AllocaSize / ElementSize;
+        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
+          VectorTy = FixedVectorType::get(ElemTy, NumElems);
+      }
     }
   }
 
@@ -861,7 +868,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
   Type *VecEltTy = VectorTy->getElementType();
-  unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
+  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
+  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
+                         "does not match the type's size\n");
+    return false;
+  }
+  unsigned ElementSize = ElementSizeInBits / 8;
+  assert(ElementSize > 0);
   for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll
new file mode 100644
index 0000000000000..4095347d78624
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Check that types where the store/allocation sizes don't match the type size
+; don't crash.
+
+
+define <7 x i9> @load_elem_i9_access_7xi9() {
+; CHECK-LABEL: @load_elem_i9_access_7xi9(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i9>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <7 x i9>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <7 x i9> [[L]]
+;
+  %p = alloca <16 x i9>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <7 x i9>, ptr addrspace(5) %g, align 1
+  ret <7 x i9> %l
+}
+
+define <8 x i1> @load_elem_i1_access_8xi1() {
+; CHECK-LABEL: @load_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @load_elem_i1_access_3xi1() {
+; CHECK-LABEL: @load_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @load_elem_i8_access_3xi1() {
+; CHECK-LABEL: @load_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <8 x i8>, align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> , ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> , ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @load_elem_i8_access_8xi1() {
+; CHECK-LABEL: @load_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    ret <8 x i1>
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> , ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <8 x i1> @storeload_elem_i1_access_8xi1() {
+; CHECK-LABEL: @storeload_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <8 x i1> , ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <8 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @storeload_elem_i1_access_3xi1() {
+; CHECK-LABEL: @storeload_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <3 x i1> , ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <3 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @storeload_elem_i8_access_3xi1() {
+; CHECK-LABEL: @storeload_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <8 x i8>, align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> , ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    store <3 x i1> , ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> , ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <3 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @storeload_elem_i8_access_8xi1() {
+; CHECK-LABEL: @storeload_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    ret <8 x i1>
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> , ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <8 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <8 x i1> @array_of_vec_elem_i1_access_8xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <16 x i1>], align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <8 x i1> , ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca [2 x <16 x i1>], align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <8 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @array_of_vec_elem_i1_access_3xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <16 x i1>], align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <3 x i1> , ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca [2 x <16 x i1>], align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <3 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @array_of_vec_elem_i8_access_3xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <8 x i8>], align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> , ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    store <3 x i1> , ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca [2 x <8 x i8>], align 1, addrspace(5)
+  store <8 x i8> , ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <3 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @array_of_vec_elem_i8_access_8xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <16 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> [[P]], i8 1, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 2, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 3, i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 4, i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 5, i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 6, i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 7, i32 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 8, i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 5, i32 4
+; CHECK-NEXT:    ret <8 x i1>
+;
+  %p = alloca [2 x <8 x i8>], align 1, addrspace(5)
+  store <8 x i8> , ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <8 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}