diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 94ecb6ba9a2b8..6c01f6dd370f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -729,6 +729,11 @@ static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
   // complicated.
   if (isa<FixedVectorType>(AccessTy)) {
     TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
+    // If the type size and the store size don't match, we would need to do
+    // more than just bitcast to translate between an extracted/insertable
+    // subvector and the accessed value.
+    if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
+      return false;
     TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
     return AccTS.isKnownMultipleOf(VecTS);
   }
@@ -813,15 +818,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
     if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
       unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
-      unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
-      // Expand vector if required to match padding of inner type,
-      // i.e. odd size subvectors.
-      // Storage size of new vector must match that of alloca for correct
-      // behaviour of byte offsets and GEP computation.
-      if (NumElems * ElementSize != AllocaSize)
-        NumElems = AllocaSize / ElementSize;
-      if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
-        VectorTy = FixedVectorType::get(ElemTy, NumElems);
+      if (ElementSize > 0) {
+        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+        // Expand vector if required to match padding of inner type,
+        // i.e. odd size subvectors.
+        // Storage size of new vector must match that of alloca for correct
+        // behaviour of byte offsets and GEP computation.
+        if (NumElems * ElementSize != AllocaSize)
+          NumElems = AllocaSize / ElementSize;
+        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
+          VectorTy = FixedVectorType::get(ElemTy, NumElems);
+      }
     }
   }
 
@@ -861,7 +868,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
   Type *VecEltTy = VectorTy->getElementType();
-  unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
+  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
+  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
+                         "does not match the type's size\n");
+    return false;
+  }
+  unsigned ElementSize = ElementSizeInBits / 8;
+  assert(ElementSize > 0);
   for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll
new file mode 100644
index 0000000000000..4095347d78624
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-byte-sizes.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Check that types where the store/allocation sizes don't match the type size
+; don't crash.
+
+
+define <7 x i9> @load_elem_i9_access_7xi9() {
+; CHECK-LABEL: @load_elem_i9_access_7xi9(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i9>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <7 x i9>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <7 x i9> [[L]]
+;
+  %p = alloca <16 x i9>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <7 x i9>, ptr addrspace(5) %g, align 1
+  ret <7 x i9> %l
+}
+
+define <8 x i1> @load_elem_i1_access_8xi1() {
+; CHECK-LABEL: @load_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @load_elem_i1_access_3xi1() {
+; CHECK-LABEL: @load_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @load_elem_i8_access_3xi1() {
+; CHECK-LABEL: @load_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <8 x i8>, align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> , ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> , ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @load_elem_i8_access_8xi1() {
+; CHECK-LABEL: @load_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    ret <8 x i1>
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> , ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <8 x i1> @storeload_elem_i1_access_8xi1() {
+; CHECK-LABEL: @storeload_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <8 x i1> , ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <8 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @storeload_elem_i1_access_3xi1() {
+; CHECK-LABEL: @storeload_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <16 x i1>, align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <3 x i1> , ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <16 x i1>, align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <3 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @storeload_elem_i8_access_3xi1() {
+; CHECK-LABEL: @storeload_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca <8 x i8>, align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> , ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    store <3 x i1> , ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> , ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <3 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @storeload_elem_i8_access_8xi1() {
+; CHECK-LABEL: @storeload_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    ret <8 x i1>
+;
+  %p = alloca <8 x i8>, align 1, addrspace(5)
+  store <8 x i8> , ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <8 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <8 x i1> @array_of_vec_elem_i1_access_8xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i1_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <16 x i1>], align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <8 x i1> , ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <8 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <8 x i1> [[L]]
+;
+  %p = alloca [2 x <16 x i1>], align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <8 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}
+
+define <3 x i1> @array_of_vec_elem_i1_access_3xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i1_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <16 x i1>], align 1, addrspace(5)
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i8, ptr addrspace(5) [[P]], i64 4
+; CHECK-NEXT:    store <3 x i1> , ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca [2 x <16 x i1>], align 1, addrspace(5)
+  %g = getelementptr i8, ptr addrspace(5) %p, i64 4
+  store <3 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+define <3 x i1> @array_of_vec_elem_i8_access_3xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i8_access_3xi1(
+; CHECK-NEXT:    [[P:%.*]] = alloca [2 x <8 x i8>], align 1, addrspace(5)
+; CHECK-NEXT:    store <8 x i8> , ptr addrspace(5) [[P]], align 1
+; CHECK-NEXT:    [[G:%.*]] = getelementptr <4 x i8>, ptr addrspace(5) [[P]], i64 1
+; CHECK-NEXT:    store <3 x i1> , ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    [[L:%.*]] = load <3 x i1>, ptr addrspace(5) [[G]], align 1
+; CHECK-NEXT:    ret <3 x i1> [[L]]
+;
+  %p = alloca [2 x <8 x i8>], align 1, addrspace(5)
+  store <8 x i8> , ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <3 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <3 x i1>, ptr addrspace(5) %g, align 1
+  ret <3 x i1> %l
+}
+
+; This one is actually not problematic.
+define <8 x i1> @array_of_vec_elem_i8_access_8xi1() {
+; CHECK-LABEL: @array_of_vec_elem_i8_access_8xi1(
+; CHECK-NEXT:    [[P:%.*]] = freeze <16 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> [[P]], i8 1, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 2, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 3, i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 4, i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 5, i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 6, i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 7, i32 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 8, i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 5, i32 4
+; CHECK-NEXT:    ret <8 x i1>
+;
+  %p = alloca [2 x <8 x i8>], align 1, addrspace(5)
+  store <8 x i8> , ptr addrspace(5) %p, align 1
+  %g = getelementptr <4 x i8>, ptr addrspace(5) %p, i64 1
+  store <8 x i1> , ptr addrspace(5) %g, align 1
+  %l = load <8 x i1>, ptr addrspace(5) %g, align 1
+  ret <8 x i1> %l
+}