diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 12351e98e5a2b..7caccf2a22f0a 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -2677,6 +2677,16 @@ class ASTContext : public RefCountedBase<ASTContext> {
     return getTypeSizeInCharsIfKnown(QualType(Ty, 0));
   }
 
+  /// Return the size in bits of an element of the given vector type.
+  uint64_t getVectorElementSize(const VectorType *VTy) const {
+    QualType EltTy = VTy->getElementType();
+    if (VTy->isPackedVectorBoolType(*this))
+      return 1;
+    if (EltTy->isBitIntType())
+      return EltTy->castAs<BitIntType>()->getNumBits();
+    return getTypeSize(EltTy);
+  }
+
   /// Return the ABI-specified alignment of a (complete) type \p T, in
   /// bits.
   unsigned getTypeAlign(QualType T) const { return getTypeInfo(T).Align; }
diff --git a/clang/include/clang/AST/TypeBase.h b/clang/include/clang/AST/TypeBase.h
index 6786b2f6cbc78..283af7d97960f 100644
--- a/clang/include/clang/AST/TypeBase.h
+++ b/clang/include/clang/AST/TypeBase.h
@@ -2640,6 +2640,8 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   // Extended vector type with bool element that is packed. HLSL doesn't pack
   // its bool vectors.
   bool isPackedVectorBoolType(const ASTContext &ctx) const;
+  // Vector type with _BitInt elements.
+  bool isBitIntVectorType() const;
   bool isSubscriptableVectorType() const;
   bool isMatrixType() const;               // Matrix type.
   bool isConstantMatrixType() const;       // Constant matrix type.
@@ -8681,6 +8683,11 @@ inline bool Type::isExtVectorBoolType() const {
   return cast<ExtVectorType>(CanonicalType)->getElementType()->isBooleanType();
 }
 
+inline bool Type::isBitIntVectorType() const {
+  return isVectorType() &&
+         cast<VectorType>(CanonicalType)->getElementType()->isBitIntType();
+}
+
 inline bool Type::isSubscriptableVectorType() const {
   return isVectorType() || isSveVLSBuiltinType();
 }
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 056bfe36b2a0a..cc91960cfe01b 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -2093,10 +2093,7 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
   case Type::ExtVector:
   case Type::Vector: {
     const auto *VT = cast<VectorType>(T);
-    TypeInfo EltInfo = getTypeInfo(VT->getElementType());
-    Width = VT->isPackedVectorBoolType(*this)
-                ? VT->getNumElements()
-                : EltInfo.Width * VT->getNumElements();
+    Width = getVectorElementSize(VT) * VT->getNumElements();
     // Enforce at least byte size and alignment.
     Width = std::max<unsigned>(8, Width);
     Align = std::max<unsigned>(8, Width);
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index b706b14945b6d..4fdea501e0b57 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -7589,8 +7589,8 @@ class APValueToBufferConverter {
     QualType EltTy = VTy->getElementType();
     unsigned NElts = VTy->getNumElements();
 
-    if (VTy->isPackedVectorBoolType(Info.Ctx)) {
-      // Special handling for OpenCL bool vectors:
+    if (VTy->isPackedVectorBoolType(Info.Ctx) || VTy->isBitIntVectorType()) {
+      // Special handling for OpenCL bool and sub-byte vectors:
       // Since these vectors are stored as packed bits, but we can't write
       // individual bits to the BitCastBuffer, we'll buffer all of the elements
       // together into an appropriately sized APInt and write them all out at
@@ -7599,18 +7599,21 @@ class APValueToBufferConverter {
      // have to worry about writing data which should have been left
      // uninitialized.
      bool BigEndian = Info.Ctx.getTargetInfo().isBigEndian();
+      uint64_t EltSize = Info.Ctx.getVectorElementSize(VTy);
 
-      llvm::APInt Res = llvm::APInt::getZero(NElts);
+      llvm::APInt Res = llvm::APInt::getZero(NElts * EltSize);
       for (unsigned I = 0; I < NElts; ++I) {
         const llvm::APSInt &EltAsInt = Val.getVectorElt(I).getInt();
-        assert(EltAsInt.isUnsigned() && EltAsInt.getBitWidth() == 1 &&
-               "bool vector element must be 1-bit unsigned integer!");
-
-        Res.insertBits(EltAsInt, BigEndian ? (NElts - I - 1) : I);
+        assert((!VTy->isPackedVectorBoolType(Info.Ctx) ||
+                (EltAsInt.isUnsigned() && EltAsInt.getBitWidth() == 1)) &&
+               "bool vector element must be 1-bit unsigned integer!");
+        uint64_t BitOffset = EltSize * (BigEndian ? (NElts - I - 1) : I);
+        Res.insertBits(EltAsInt, BitOffset);
       }
 
-      SmallVector<uint8_t, 8> Bytes(NElts / 8);
-      llvm::StoreIntToMemory(Res, &*Bytes.begin(), NElts / 8);
+      uint64_t NumBytes = NElts * EltSize / 8;
+      SmallVector<uint8_t, 8> Bytes(NumBytes);
+      llvm::StoreIntToMemory(Res, &*Bytes.begin(), NumBytes);
       Buffer.writeObject(Offset, Bytes);
     } else {
       // Iterate over each of the elements and write them out to the buffer at
@@ -7852,13 +7855,11 @@ class BufferToAPValueConverter {
   std::optional<APValue> visit(const VectorType *VTy, CharUnits Offset) {
     QualType EltTy = VTy->getElementType();
     unsigned NElts = VTy->getNumElements();
-    unsigned EltSize =
-        VTy->isPackedVectorBoolType(Info.Ctx) ? 1 : Info.Ctx.getTypeSize(EltTy);
 
     SmallVector<APValue, 4> Elts;
     Elts.reserve(NElts);
-    if (VTy->isPackedVectorBoolType(Info.Ctx)) {
-      // Special handling for OpenCL bool vectors:
+    if (VTy->isPackedVectorBoolType(Info.Ctx) || VTy->isBitIntVectorType()) {
+      // Special handling for OpenCL bool and sub-byte vectors:
       // Since these vectors are stored as packed bits, but we can't read
       // individual bits from the BitCastBuffer, we'll buffer all of the
       // elements together into an appropriately sized APInt and write them all
@@ -7867,20 +7868,22 @@ class BufferToAPValueConverter {
      // we don't have to worry about reading any padding data which didn't
      // actually need to be accessed.
      bool BigEndian = Info.Ctx.getTargetInfo().isBigEndian();
+      uint64_t EltSize = Info.Ctx.getVectorElementSize(VTy);
+      bool IsSigned = EltTy->isSignedIntegerType();
+      uint64_t NumBytes = NElts * EltSize / 8;
 
       SmallVector<uint8_t, 8> Bytes;
-      Bytes.reserve(NElts / 8);
-      if (!Buffer.readObject(Offset, CharUnits::fromQuantity(NElts / 8), Bytes))
+      Bytes.reserve(NumBytes);
+      if (!Buffer.readObject(Offset, CharUnits::fromQuantity(NumBytes), Bytes))
         return std::nullopt;
 
-      APSInt SValInt(NElts, true);
-      llvm::LoadIntFromMemory(SValInt, &*Bytes.begin(), Bytes.size());
+      APSInt SValInt(NElts * EltSize);
+      llvm::LoadIntFromMemory(SValInt, Bytes.data(), Bytes.size());
 
       for (unsigned I = 0; I < NElts; ++I) {
-        llvm::APInt Elt =
-            SValInt.extractBits(1, (BigEndian ? NElts - I - 1 : I) * EltSize);
-        Elts.emplace_back(
-            APSInt(std::move(Elt), !EltTy->isSignedIntegerType()));
+        uint64_t BitOffset = EltSize * (BigEndian ? (NElts - I - 1) : I);
+        llvm::APInt Elt = SValInt.extractBits(EltSize, BitOffset);
+        Elts.emplace_back(APSInt(std::move(Elt), !IsSigned));
       }
     } else {
       // Iterate over each of the elements and read them from the buffer at
@@ -7986,8 +7989,7 @@ static bool checkBitCastConstexprEligibilityType(SourceLocation Loc,
   if (const auto *VTy = Ty->getAs<VectorType>()) {
     QualType EltTy = VTy->getElementType();
     unsigned NElts = VTy->getNumElements();
-    unsigned EltSize =
-        VTy->isPackedVectorBoolType(Ctx) ? 1 : Ctx.getTypeSize(EltTy);
+    unsigned EltSize = Ctx.getVectorElementSize(VTy);
 
     if ((NElts * EltSize) % Ctx.getCharWidth() != 0) {
       // The vector's size in bits is not a multiple of the target's byte size,
diff --git a/clang/test/CodeGenCXX/ext-int.cpp b/clang/test/CodeGenCXX/ext-int.cpp
index a75b3701e36ef..0454363ca7f80 100644
--- a/clang/test/CodeGenCXX/ext-int.cpp
+++ b/clang/test/CodeGenCXX/ext-int.cpp
@@ -573,7 +573,7 @@ void VectorTest(uint16_t4 first, uint16_t4 second) {
 
 typedef unsigned _BitInt(4) uint4_t4 __attribute__((ext_vector_type(4)));
 void VectorTest(uint4_t4 first, uint4_t4 second) {
-  // LIN64: define{{.*}} void @_Z10VectorTestDv4_DU4_S0_(i32 %{{.+}}, i32 %{{.+}})
+  // LIN64: define{{.*}} void @_Z10VectorTestDv4_DU4_S0_(i16 %{{.+}}, i16 %{{.+}})
   // LIN32: define{{.*}} void @_Z10VectorTestDv4_DU4_S0_(<4 x i4> %{{.+}}, <4 x i4> %{{.+}})
   // WIN64: define dso_local void @"?VectorTest@@YAXT?$__vector@U?$_UBitInt@$03@__clang@@$03@__clang@@0@Z"(<4 x i4> %{{.+}}, <4 x i4> %{{.+}})
   // WIN32: define dso_local void @"?VectorTest@@YAXT?$__vector@U?$_UBitInt@$03@__clang@@$03@__clang@@0@Z"(<4 x i4> inreg %{{.+}}, <4 x i4> inreg %{{.+}})
@@ -585,23 +585,25 @@ void VectorTest(uint4_t4 first, uint4_t4 second) {
 
 typedef unsigned _BitInt(2) uint2_t2 __attribute__((ext_vector_type(2)));
 uint2_t2 TestBitIntVector2x2Alloca(uint2_t2 v1, uint2_t2 v2) {
-  // LIN64: define dso_local i16 @_Z25TestBitIntVector2x2AllocaDv2_DU2_S0_(i16 %[[V1Coerce:.+]], i16 %[[V2Coerce:.+]])
-  // LIN64: %[[RetVal:.+]] = alloca <2 x i2>, align 2
-  // LIN64: %[[V1Addr:.+]] = alloca <2 x i2>, align 2
-  // LIN64: %[[V2Addr:.+]] = alloca <2 x i2>, align 2
-  // LIN64: %[[RetValCoerce:.+]] = alloca i16, align 2
-  // LIN64: call void @llvm.memcpy.p0.p0.i64(ptr align 2 %[[RetValCoerce]], ptr align 2 %[[RetVal]], i64 1, i1 false)
-  // LIN64: %[[Ret:.+]] = load i16, ptr %[[RetValCoerce]], align 2
-  // LIN64: ret i16 %[[Ret]]
+  // LIN64: define dso_local i8 @_Z25TestBitIntVector2x2AllocaDv2_DU2_S0_(i8 %[[V1Coerce:.+]], i8 %[[V2Coerce:.+]])
+  // LIN64: %[[RetVal:.+]] = alloca <2 x i2>, align 1
+  // LIN64: %[[V1Addr:.+]] = alloca <2 x i2>, align 1
+  // LIN64: %[[V2Addr:.+]] = alloca <2 x i2>, align 1
+  // LIN64: %[[V1Val:.+]] = load <2 x i2>, ptr %[[V1Addr]], align 1
+  // LIN64: %[[V2Val:.+]] = load <2 x i2>, ptr %[[V2Addr]], align 1
+  // LIN64: %[[AddVal:.+]] = add <2 x i2> %0, %1
+  // LIN64: store <2 x i2> %[[AddVal]], ptr %[[RetVal]], align 1
+  // LIN64: %[[Ret:.+]] = load i8, ptr %[[RetVal]], align 1
+  // LIN64: ret i8 %[[Ret]]
 
   // LIN32: define dso_local <2 x i2> @_Z25TestBitIntVector2x2AllocaDv2_DU2_S0_(<2 x i2> %{{.+}}, <2 x i2> %{{.+}})
-  // LIN32: %[[V1Addr:.+]] = alloca <2 x i2>, align 2
-  // LIN32: %[[V2Addr:.+]] = alloca <2 x i2>, align 2
+  // LIN32: %[[V1Addr:.+]] = alloca <2 x i2>, align 1
+  // LIN32: %[[V2Addr:.+]] = alloca <2 x i2>, align 1
   // LIN32: ret <2 x i2> %[[Ret:.+]]
 
   // WIN: define dso_local <2 x i2> @"?TestBitIntVector2x2Alloca@@YAT?$__vector@U?$_UBitInt@$01@__clang@@$01@__clang@@T12@0@Z"(<2 x i2>{{.*}}, <2 x i2>{{.*}})
-  // WIN: %[[V1:.+]] = alloca <2 x i2>, align 2
-  // WIN: %[[V2:.+]] = alloca <2 x i2>, align 2
+  // WIN: %[[V1:.+]] = alloca <2 x i2>, align 1
+  // WIN: %[[V2:.+]] = alloca <2 x i2>, align 1
   // WIN: ret <2 x i2> %[[Ret:.+]]
   return v1 + v2;
 }
diff --git a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
index 2e7531b334ecb..98b868fcd5bc2 100644
--- a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
+++ b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
@@ -70,27 +70,25 @@ i512x3 v3(i512x3 a) {
   return a + a;
 }
 
-// CHECK-LABEL: define dso_local i32 @_Z2v4Dv3_DB4_(
-// CHECK-SAME: i32 [[A_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-LABEL: define dso_local i16 @_Z2v4Dv3_DB4_(
+// CHECK-SAME: i16 [[A_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <3 x i4>, align 4
-// CHECK-NEXT:    [[A:%.*]] = alloca <3 x i4>, align 4
-// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <3 x i4>, align 4
-// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store i32 [[A_COERCE]], ptr [[A]], align 4
-// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x i4>, ptr [[A]], align 4
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca <3 x i4>, align 2
+// CHECK-NEXT:    [[A:%.*]] = alloca <3 x i4>, align 2
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <3 x i4>, align 2
+// CHECK-NEXT:    store i16 [[A_COERCE]], ptr [[A]], align 2
+// CHECK-NEXT:    [[LOADVECN:%.*]] = load <4 x i4>, ptr [[A]], align 2
 // CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x i4> [[LOADVECN]], <4 x i4> poison, <3 x i32> <i32 0, i32 1, i32 2>
 // CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i4> [[A1]], <3 x i4> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x i4> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4
-// CHECK-NEXT:    [[LOADVECN2:%.*]] = load <4 x i4>, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store <4 x i4> [[EXTRACTVEC]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[LOADVECN2:%.*]] = load <4 x i4>, ptr [[A_ADDR]], align 2
 // CHECK-NEXT:    [[EXTRACTVEC3:%.*]] = shufflevector <4 x i4> [[LOADVECN2]], <4 x i4> poison, <3 x i32> <i32 0, i32 1, i32 2>
-// CHECK-NEXT:    [[LOADVECN4:%.*]] = load <4 x i4>, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[LOADVECN4:%.*]] = load <4 x i4>, ptr [[A_ADDR]], align 2
 // CHECK-NEXT:    [[EXTRACTVEC5:%.*]] = shufflevector <4 x i4> [[LOADVECN4]], <4 x i4> poison, <3 x i32> <i32 0, i32 1, i32 2>
 // CHECK-NEXT:    [[ADD:%.*]] = add <3 x i4> [[EXTRACTVEC3]], [[EXTRACTVEC5]]
-// CHECK-NEXT:    store <3 x i4> [[ADD]], ptr [[RETVAL]], align 4
-// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RETVAL_COERCE]], ptr align 4 [[RETVAL]], i64 2, i1 false)
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[RETVAL_COERCE]], align 4
-// CHECK-NEXT:    ret i32 [[TMP0]]
+// CHECK-NEXT:    store <3 x i4> [[ADD]], ptr [[RETVAL]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[RETVAL]], align 2
+// CHECK-NEXT:    ret i16 [[TMP0]]
 //
 i4x3 v4(i4x3 a) {
   return a + a;
diff --git a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
index 7a6d7cb353158..893abf0636a5d 100644
--- a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
+++ b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
@@ -510,6 +510,56 @@ constexpr bool9 bad_short_to_bool9 = __builtin_bit_cast(bool9, static_cast<unsigned short>(0));
 
+template <unsigned Bits, unsigned N>
+using packed_vec_t = _BitInt(Bits) __attribute__((ext_vector_type(N)));
+
+static_assert(round_trip<packed_vec_t<2, 4>>(static_cast<unsigned char>(0)), "");
+static_assert(round_trip<packed_vec_t<2, 4>>(static_cast<unsigned char>(1)), "");
+static_assert(round_trip<packed_vec_t<2, 4>>(static_cast<unsigned char>(0x55)), "");
+static_assert(round_trip<packed_vec_t<2, 8>>(static_cast<unsigned short>(0)), "");
+static_assert(round_trip<packed_vec_t<2, 8>>(static_cast<unsigned short>(-1)), "");
+static_assert(round_trip<packed_vec_t<2, 8>>(static_cast<unsigned short>(0x5555)), "");
+
+static_assert(bit_cast<unsigned char>(packed_vec_t<2, 4>{1, -2, 0, -1}) == (LITTLE_END ? 0xC9 : 0x63), "");
+static_assert(bit_cast<unsigned short>(packed_vec_t<2, 8>{1, -2, 0, -1, -2, -1, 1, 0}) == (LITTLE_END ? 0x1EC9 : 0x63B4), "");
+
+static_assert(round_trip<packed_vec_t<4, 2>>(static_cast<unsigned char>(0)), "");
+static_assert(round_trip<packed_vec_t<4, 2>>(static_cast<unsigned char>(1)), "");
+static_assert(round_trip<packed_vec_t<4, 2>>(static_cast<unsigned char>(0x55)), "");
+static_assert(round_trip<packed_vec_t<4, 4>>(static_cast<unsigned short>(0)), "");
+static_assert(round_trip<packed_vec_t<4, 4>>(static_cast<unsigned short>(-1)), "");
+static_assert(round_trip<packed_vec_t<4, 4>>(static_cast<unsigned short>(0x5555)), "");
+
+static_assert(bit_cast<unsigned char>(packed_vec_t<4, 2>{-4, -7}) == (LITTLE_END ? 0x9C : 0xC9), "");
+static_assert(bit_cast<unsigned short>(packed_vec_t<4, 4>{3, -5, -1, 7}) == (LITTLE_END ? 0x7FB3 : 0x3BF7), "");
+
+// expected-error@+2 {{constexpr variable 'bad_packed_vec_2_3_to_char' must be initialized by a constant expression}}
+// expected-note@+1 {{bit_cast involving type '_BitInt(2) __attribute__((ext_vector_type(3)))' (vector of 3 '_BitInt(2)' values) is not allowed in a constant expression; element size 2 * element count 3 is not a multiple of the byte size 8}}
+constexpr unsigned char bad_packed_vec_2_3_to_char = __builtin_bit_cast(unsigned char, packed_vec_t<2, 3>{1, 0, 1});
+// expected-error@+2 {{constexpr variable 'bad_char_to_packed_vec_2_3' must be initialized by a constant expression}}
+// expected-note@+1 {{bit_cast involving type '_BitInt(2) __attribute__((ext_vector_type(3)))' (vector of 3 '_BitInt(2)' values) is not allowed in a constant expression; element size 2 * element count 3 is not a multiple of the byte size 8}}
+constexpr packed_vec_t<2, 3> bad_char_to_packed_vec_2_3 = __builtin_bit_cast(packed_vec_t<2, 3>, static_cast<unsigned char>(0));
+
+// expected-error@+2 {{constexpr variable 'bad_packed_vec_2_6_to_short' must be initialized by a constant expression}}
+// expected-note@+1 {{bit_cast involving type '_BitInt(2) __attribute__((ext_vector_type(6)))' (vector of 6 '_BitInt(2)' values) is not allowed in a constant expression; element size 2 * element count 6 is not a multiple of the byte size 8}}
+constexpr unsigned short bad_packed_vec_2_6_to_short = __builtin_bit_cast(unsigned short, packed_vec_t<2, 6>{1, 0, 1});
+// expected-error@+2 {{constexpr variable 'bad_short_to_packed_vec_2_6' must be initialized by a constant expression}}
+// expected-note@+1 {{bit_cast involving type '_BitInt(2) __attribute__((ext_vector_type(6)))' (vector of 6 '_BitInt(2)' values) is not allowed in a constant expression; element size 2 * element count 6 is not a multiple of the byte size 8}}
+constexpr packed_vec_t<2, 6> bad_short_to_packed_vec_2_6 = __builtin_bit_cast(packed_vec_t<2, 6>, static_cast<unsigned short>(0));
+
+// expected-error@+2 {{constexpr variable 'bad_packed_vec_4_3_to_short' must be initialized by a constant expression}}
+// expected-note@+1 {{bit_cast involving type '_BitInt(4) __attribute__((ext_vector_type(3)))' (vector of 3 '_BitInt(4)' values) is not allowed in a constant expression; element size 4 * element count 3 is not a multiple of the byte size 8}}
+constexpr unsigned short bad_packed_vec_4_3_to_short = __builtin_bit_cast(unsigned short, packed_vec_t<4, 3>{1, 0, 1});
+// expected-error@+2 {{constexpr variable 'bad_short_to_packed_vec_4_3' must be initialized by a constant expression}}
+// expected-note@+1 {{bit_cast involving type '_BitInt(4) __attribute__((ext_vector_type(3)))' (vector of 3 '_BitInt(4)' values) is not allowed in a constant expression; element size 4 * element count 3 is not a multiple of the byte size 8}}
+constexpr packed_vec_t<4, 3> bad_short_to_packed_vec_4_3 = __builtin_bit_cast(packed_vec_t<4, 3>, static_cast<unsigned short>(0));
+
+// expected-error@+2 {{constexpr variable 'bad_packed_vec_4_5_to_int' must be initialized by a constant expression}}
+// expected-note@+1 {{bit_cast involving type '_BitInt(4) __attribute__((ext_vector_type(5)))' (vector of 5 '_BitInt(4)' values) is not allowed in a constant expression; element size 4 * element count 5 is not a multiple of the byte size 8}}
+constexpr unsigned int bad_packed_vec_4_5_to_int = __builtin_bit_cast(unsigned int, packed_vec_t<4, 5>{1, 0, 1});
+// expected-error@+2 {{constexpr variable 'bad_int_to_packed_vec_4_5' must be initialized by a constant expression}}
+// expected-note@+1 {{bit_cast involving type '_BitInt(4) __attribute__((ext_vector_type(5)))' (vector of 5 '_BitInt(4)' values) is not allowed in a constant expression; element size 4 * element count 5 is not a multiple of the byte size 8}}
+constexpr packed_vec_t<4, 5> bad_int_to_packed_vec_4_5 = __builtin_bit_cast(packed_vec_t<4, 5>, static_cast<unsigned int>(0));
+
 }
 
 namespace test_complex {
diff --git a/clang/test/SemaCXX/ext-int.cpp b/clang/test/SemaCXX/ext-int.cpp
index 5c566dafed931..d3b72761402d0 100644
--- a/clang/test/SemaCXX/ext-int.cpp
+++ b/clang/test/SemaCXX/ext-int.cpp
@@ -293,3 +293,33 @@ void FromPaper1() {
 void FromPaper2(_BitInt(8) a1, _BitInt(24) a2) {
   static_assert(is_same<decltype(a1 + a2), _BitInt(24)>::value, "");
 }
+
+// Check sub-byte integer vector size and alignment, expecting packing.
+template <unsigned Bits, unsigned N>
+using packed_vec_t = _BitInt(Bits) __attribute__((ext_vector_type(N)));
+void SubByteVecPacking() {
+  static_assert(sizeof(packed_vec_t<2, 2>) == 1);
+  static_assert(sizeof(packed_vec_t<2, 3>) == 1);
+  static_assert(sizeof(packed_vec_t<2, 4>) == 1);
+  static_assert(sizeof(packed_vec_t<2, 8>) == 2);
+  static_assert(sizeof(packed_vec_t<2, 16>) == 4);
+  static_assert(sizeof(packed_vec_t<2, 32>) == 8);
+  static_assert(sizeof(packed_vec_t<4, 2>) == 1);
+  static_assert(sizeof(packed_vec_t<4, 4>) == 2);
+  static_assert(sizeof(packed_vec_t<4, 8>) == 4);
+  static_assert(sizeof(packed_vec_t<4, 16>) == 8);
+  static_assert(sizeof(packed_vec_t<4, 32>) == 16);
+
+  static_assert(alignof(packed_vec_t<2, 2>) == 1);
+  static_assert(alignof(packed_vec_t<2, 3>) == 1);
+  static_assert(alignof(packed_vec_t<2, 4>) == 1);
+  static_assert(alignof(packed_vec_t<2, 8>) == 2);
+  static_assert(alignof(packed_vec_t<2, 16>) == 4);
+  static_assert(alignof(packed_vec_t<2, 32>) == 8);
+  static_assert(alignof(packed_vec_t<4, 2>) == 1);
+  static_assert(alignof(packed_vec_t<4, 3>) == 2);
+  static_assert(alignof(packed_vec_t<4, 4>) == 2);
+  static_assert(alignof(packed_vec_t<4, 8>) == 4);
+  static_assert(alignof(packed_vec_t<4, 16>) == 8);
+  static_assert(alignof(packed_vec_t<4, 32>) == 16);
+}
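
Illustrative sketch, not part of the patch: assuming a little-endian target, the packed layout introduced above makes a sub-byte _BitInt vector round-trip through __builtin_bit_cast in a constant expression. The typedef name i4x2 and the expected byte 0xB3 below are hypothetical, chosen only to mirror the nibble packing exercised by the constexpr-builtin-bit-cast.cpp tests in this diff.

// Sketch only: with this patch, _BitInt(4) elements are packed two per byte.
typedef _BitInt(4) i4x2 __attribute__((ext_vector_type(2)));

// sizeof(i4x2) is now 1, so a bit_cast to unsigned char is well-formed.
// On a little-endian target, element 0 lands in the low nibble (3 == 0x3)
// and element 1 in the high nibble (-5 == 0xB), giving the byte 0xB3.
constexpr unsigned char PackedBits =
    __builtin_bit_cast(unsigned char, i4x2{3, -5});
static_assert(PackedBits == 0xB3, "");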