Skip to content

Commit 545a9df

Browse files
committed
[clang] Fix size and alignment of packed sub-byte integer vectors
When using sub-byte integer types in vectors, the data is packed into the first N bits, where N is the bit-size of the sub-byte integer type multiplied by the number of vector elements. However, clang currently reports the size as if each element were one byte wide, because the element type is considered a single byte wide when taken in isolation. This commit fixes the reported size and alignment of sub-byte vector types so that they correspond to the bit-packed layout they employ. Signed-off-by: Larsen, Steffen <[email protected]>
1 parent a7016c4 commit 545a9df

File tree

4 files changed

+67
-32
lines changed

4 files changed

+67
-32
lines changed

clang/lib/AST/ASTContext.cpp

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2093,10 +2093,15 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
20932093
case Type::ExtVector:
20942094
case Type::Vector: {
20952095
const auto *VT = cast<VectorType>(T);
2096-
TypeInfo EltInfo = getTypeInfo(VT->getElementType());
2097-
Width = VT->isPackedVectorBoolType(*this)
2098-
? VT->getNumElements()
2099-
: EltInfo.Width * VT->getNumElements();
2096+
QualType Elt = VT->getElementType();
2097+
uint64_t EltWidth = [&]() -> uint64_t {
2098+
if (VT->isPackedVectorBoolType(*this))
2099+
return 1;
2100+
if (Elt.getTypePtrOrNull() && Elt.getTypePtr()->isBitIntType())
2101+
return Elt.getTypePtr()->castAs<BitIntType>()->getNumBits();
2102+
return getTypeInfo(Elt).Width;
2103+
}();
2104+
Width = EltWidth * VT->getNumElements();
21002105
// Enforce at least byte size and alignment.
21012106
Width = std::max<unsigned>(8, Width);
21022107
Align = std::max<unsigned>(8, Width);

clang/test/CodeGenCXX/ext-int.cpp

Lines changed: 15 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -573,7 +573,7 @@ void VectorTest(uint16_t4 first, uint16_t4 second) {
573573

574574
typedef unsigned _BitInt(4) uint4_t4 __attribute__((ext_vector_type(4)));
575575
void VectorTest(uint4_t4 first, uint4_t4 second) {
576-
// LIN64: define{{.*}} void @_Z10VectorTestDv4_DU4_S0_(i32 %{{.+}}, i32 %{{.+}})
576+
// LIN64: define{{.*}} void @_Z10VectorTestDv4_DU4_S0_(i16 %{{.+}}, i16 %{{.+}})
577577
// LIN32: define{{.*}} void @_Z10VectorTestDv4_DU4_S0_(<4 x i4> %{{.+}}, <4 x i4> %{{.+}})
578578
// WIN64: define dso_local void @"?VectorTest@@YAXT?$__vector@U?$_UBitInt@$03@__clang@@$03@__clang@@0@Z"(<4 x i4> %{{.+}}, <4 x i4> %{{.+}})
579579
// WIN32: define dso_local void @"?VectorTest@@YAXT?$__vector@U?$_UBitInt@$03@__clang@@$03@__clang@@0@Z"(<4 x i4> inreg %{{.+}}, <4 x i4> inreg %{{.+}})
@@ -585,23 +585,25 @@ void VectorTest(uint4_t4 first, uint4_t4 second) {
585585

586586
typedef unsigned _BitInt(2) uint2_t2 __attribute__((ext_vector_type(2)));
587587
uint2_t2 TestBitIntVector2x2Alloca(uint2_t2 v1, uint2_t2 v2) {
588-
// LIN64: define dso_local i16 @_Z25TestBitIntVector2x2AllocaDv2_DU2_S0_(i16 %[[V1Coerce:.+]], i16 %[[V2Coerce:.+]])
589-
// LIN64: %[[RetVal:.+]] = alloca <2 x i2>, align 2
590-
// LIN64: %[[V1Addr:.+]] = alloca <2 x i2>, align 2
591-
// LIN64: %[[V2Addr:.+]] = alloca <2 x i2>, align 2
592-
// LIN64: %[[RetValCoerce:.+]] = alloca i16, align 2
593-
// LIN64: call void @llvm.memcpy.p0.p0.i64(ptr align 2 %[[RetValCoerce]], ptr align 2 %[[RetVal]], i64 1, i1 false)
594-
// LIN64: %[[Ret:.+]] = load i16, ptr %[[RetValCoerce]], align 2
595-
// LIN64: ret i16 %[[Ret]]
588+
// LIN64: define dso_local i8 @_Z25TestBitIntVector2x2AllocaDv2_DU2_S0_(i8 %[[V1Coerce:.+]], i8 %[[V2Coerce:.+]])
589+
// LIN64: %[[RetVal:.+]] = alloca <2 x i2>, align 1
590+
// LIN64: %[[V1Addr:.+]] = alloca <2 x i2>, align 1
591+
// LIN64: %[[V2Addr:.+]] = alloca <2 x i2>, align 1
592+
// LIN64: %[[V1Val:.+]] = load <2 x i2>, ptr %[[V1Addr]], align 1
593+
// LIN64: %[[V2Val:.+]] = load <2 x i2>, ptr %[[V2Addr]], align 1
594+
// LIN64: %[[AddVal:.+]] = add <2 x i2> %0, %1
595+
// LIN64: store <2 x i2> %[[AddVal]], ptr %[[RetVal]], align 1
596+
// LIN64: %[[Ret:.+]] = load i8, ptr %[[RetVal]], align 1
597+
// LIN64: ret i8 %[[Ret]]
596598

597599
// LIN32: define dso_local <2 x i2> @_Z25TestBitIntVector2x2AllocaDv2_DU2_S0_(<2 x i2> %{{.+}}, <2 x i2> %{{.+}})
598-
// LIN32: %[[V1Addr:.+]] = alloca <2 x i2>, align 2
599-
// LIN32: %[[V2Addr:.+]] = alloca <2 x i2>, align 2
600+
// LIN32: %[[V1Addr:.+]] = alloca <2 x i2>, align 1
601+
// LIN32: %[[V2Addr:.+]] = alloca <2 x i2>, align 1
600602
// LIN32: ret <2 x i2> %[[Ret:.+]]
601603

602604
// WIN: define dso_local <2 x i2> @"?TestBitIntVector2x2Alloca@@YAT?$__vector@U?$_UBitInt@$01@__clang@@$01@__clang@@T12@0@Z"(<2 x i2>{{.*}}, <2 x i2>{{.*}})
603-
// WIN: %[[V1:.+]] = alloca <2 x i2>, align 2
604-
// WIN: %[[V2:.+]] = alloca <2 x i2>, align 2
605+
// WIN: %[[V1:.+]] = alloca <2 x i2>, align 1
606+
// WIN: %[[V2:.+]] = alloca <2 x i2>, align 1
605607
// WIN: ret <2 x i2> %[[Ret:.+]]
606608
return v1 + v2;
607609
}

clang/test/CodeGenCXX/matrix-vector-bit-int.cpp

Lines changed: 13 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -70,27 +70,25 @@ i512x3 v3(i512x3 a) {
7070
return a + a;
7171
}
7272

73-
// CHECK-LABEL: define dso_local i32 @_Z2v4Dv3_DB4_(
74-
// CHECK-SAME: i32 [[A_COERCE:%.*]]) #[[ATTR0]] {
73+
// CHECK-LABEL: define dso_local i16 @_Z2v4Dv3_DB4_(
74+
// CHECK-SAME: i16 [[A_COERCE:%.*]]) #[[ATTR0]] {
7575
// CHECK-NEXT: [[ENTRY:.*:]]
76-
// CHECK-NEXT: [[RETVAL:%.*]] = alloca <3 x i4>, align 4
77-
// CHECK-NEXT: [[A:%.*]] = alloca <3 x i4>, align 4
78-
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i4>, align 4
79-
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca i32, align 4
80-
// CHECK-NEXT: store i32 [[A_COERCE]], ptr [[A]], align 4
81-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i4>, ptr [[A]], align 4
76+
// CHECK-NEXT: [[RETVAL:%.*]] = alloca <3 x i4>, align 2
77+
// CHECK-NEXT: [[A:%.*]] = alloca <3 x i4>, align 2
78+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i4>, align 2
79+
// CHECK-NEXT: store i16 [[A_COERCE]], ptr [[A]], align 2
80+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i4>, ptr [[A]], align 2
8281
// CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i4> [[LOADVECN]], <4 x i4> poison, <3 x i32> <i32 0, i32 1, i32 2>
8382
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i4> [[A1]], <3 x i4> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
84-
// CHECK-NEXT: store <4 x i4> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4
85-
// CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i4>, ptr [[A_ADDR]], align 4
83+
// CHECK-NEXT: store <4 x i4> [[EXTRACTVEC]], ptr [[A_ADDR]], align 2
84+
// CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i4>, ptr [[A_ADDR]], align 2
8685
// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i4> [[LOADVECN2]], <4 x i4> poison, <3 x i32> <i32 0, i32 1, i32 2>
87-
// CHECK-NEXT: [[LOADVECN4:%.*]] = load <4 x i4>, ptr [[A_ADDR]], align 4
86+
// CHECK-NEXT: [[LOADVECN4:%.*]] = load <4 x i4>, ptr [[A_ADDR]], align 2
8887
// CHECK-NEXT: [[EXTRACTVEC5:%.*]] = shufflevector <4 x i4> [[LOADVECN4]], <4 x i4> poison, <3 x i32> <i32 0, i32 1, i32 2>
8988
// CHECK-NEXT: [[ADD:%.*]] = add <3 x i4> [[EXTRACTVEC3]], [[EXTRACTVEC5]]
90-
// CHECK-NEXT: store <3 x i4> [[ADD]], ptr [[RETVAL]], align 4
91-
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RETVAL_COERCE]], ptr align 4 [[RETVAL]], i64 2, i1 false)
92-
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[RETVAL_COERCE]], align 4
93-
// CHECK-NEXT: ret i32 [[TMP0]]
89+
// CHECK-NEXT: store <3 x i4> [[ADD]], ptr [[RETVAL]], align 2
90+
// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[RETVAL]], align 2
91+
// CHECK-NEXT: ret i16 [[TMP0]]
9492
//
9593
i4x3 v4(i4x3 a) {
9694
return a + a;

clang/test/SemaCXX/ext-int.cpp

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -293,3 +293,33 @@ void FromPaper1() {
293293
void FromPaper2(_BitInt(8) a1, _BitInt(24) a2) {
294294
static_assert(is_same<decltype(a1 * (_BitInt(32))a2), _BitInt(32)>::value, "");
295295
}
296+
297+
// Check sub-byte integer vector size and alignment, expecting packing.
298+
template <int Bits, int N>
299+
using packed_vec_t = _BitInt(Bits) __attribute__((ext_vector_type(N)));
300+
void SubByteVecPacking() {
301+
static_assert(sizeof(packed_vec_t<2, 2>) == 1);
302+
static_assert(sizeof(packed_vec_t<2, 3>) == 1);
303+
static_assert(sizeof(packed_vec_t<2, 4>) == 1);
304+
static_assert(sizeof(packed_vec_t<2, 8>) == 2);
305+
static_assert(sizeof(packed_vec_t<2, 16>) == 4);
306+
static_assert(sizeof(packed_vec_t<2, 32>) == 8);
307+
static_assert(sizeof(packed_vec_t<4, 2>) == 1);
308+
static_assert(sizeof(packed_vec_t<4, 4>) == 2);
309+
static_assert(sizeof(packed_vec_t<4, 8>) == 4);
310+
static_assert(sizeof(packed_vec_t<4, 16>) == 8);
311+
static_assert(sizeof(packed_vec_t<4, 32>) == 16);
312+
313+
static_assert(alignof(packed_vec_t<2, 2>) == 1);
314+
static_assert(alignof(packed_vec_t<2, 3>) == 1);
315+
static_assert(alignof(packed_vec_t<2, 4>) == 1);
316+
static_assert(alignof(packed_vec_t<2, 8>) == 2);
317+
static_assert(alignof(packed_vec_t<2, 16>) == 4);
318+
static_assert(alignof(packed_vec_t<2, 32>) == 8);
319+
static_assert(alignof(packed_vec_t<4, 2>) == 1);
320+
static_assert(alignof(packed_vec_t<4, 3>) == 2);
321+
static_assert(alignof(packed_vec_t<4, 4>) == 2);
322+
static_assert(alignof(packed_vec_t<4, 8>) == 4);
323+
static_assert(alignof(packed_vec_t<4, 16>) == 8);
324+
static_assert(alignof(packed_vec_t<4, 32>) == 16);
325+
}

0 commit comments

Comments
 (0)