@@ -7,6 +7,8 @@ using i32x3 = _BitInt(32) __attribute__((ext_vector_type(3)));
77using i32x3x3 = _BitInt (32 ) __attribute__((matrix_type(3 , 3 )));
88using i512x3 = _BitInt (512 ) __attribute__((ext_vector_type(3 )));
99using i512x3x3 = _BitInt (512 ) __attribute__((matrix_type(3 , 3 )));
10+ using i4x3 = _BitInt (4 ) __attribute__((ext_vector_type(3 )));
11+ using i4x3x3 = _BitInt (4 ) __attribute__((matrix_type(3 , 3 )));
1012
1113// CHECK-LABEL: define dso_local i32 @_Z2v1Dv3_DB8_(
1214// CHECK-SAME: i32 [[A_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -68,8 +70,34 @@ i512x3 v3(i512x3 a) {
6870 return a + a;
6971}
7072
73+ // CHECK-LABEL: define dso_local i32 @_Z2v4Dv3_DB4_(
74+ // CHECK-SAME: i32 [[A_COERCE:%.*]]) #[[ATTR0]] {
75+ // CHECK-NEXT: [[ENTRY:.*:]]
76+ // CHECK-NEXT: [[RETVAL:%.*]] = alloca <3 x i4>, align 4
77+ // CHECK-NEXT: [[A:%.*]] = alloca <3 x i4>, align 4
78+ // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i4>, align 4
79+ // CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca i32, align 4
80+ // CHECK-NEXT: store i32 [[A_COERCE]], ptr [[A]], align 4
81+ // CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i4>, ptr [[A]], align 4
82+ // CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i4> [[LOADVECN]], <4 x i4> poison, <3 x i32> <i32 0, i32 1, i32 2>
83+ // CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i4> [[A1]], <3 x i4> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
84+ // CHECK-NEXT: store <4 x i4> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4
85+ // CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i4>, ptr [[A_ADDR]], align 4
86+ // CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i4> [[LOADVECN2]], <4 x i4> poison, <3 x i32> <i32 0, i32 1, i32 2>
87+ // CHECK-NEXT: [[LOADVECN4:%.*]] = load <4 x i4>, ptr [[A_ADDR]], align 4
88+ // CHECK-NEXT: [[EXTRACTVEC5:%.*]] = shufflevector <4 x i4> [[LOADVECN4]], <4 x i4> poison, <3 x i32> <i32 0, i32 1, i32 2>
89+ // CHECK-NEXT: [[ADD:%.*]] = add <3 x i4> [[EXTRACTVEC3]], [[EXTRACTVEC5]]
90+ // CHECK-NEXT: store <3 x i4> [[ADD]], ptr [[RETVAL]], align 4
91+ // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RETVAL_COERCE]], ptr align 4 [[RETVAL]], i64 2, i1 false)
92+ // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[RETVAL_COERCE]], align 4
93+ // CHECK-NEXT: ret i32 [[TMP0]]
94+ //
95+ i4x3 v4 (i4x3 a) {
96+ return a + a;
97+ }
98+
7199// CHECK-LABEL: define dso_local noundef <9 x i8> @_Z2m1u11matrix_typeILm3ELm3EDB8_E(
72- // CHECK-SAME: <9 x i8> noundef [[A:%.*]]) #[[ATTR3 :[0-9]+]] {
100+ // CHECK-SAME: <9 x i8> noundef [[A:%.*]]) #[[ATTR4 :[0-9]+]] {
73101// CHECK-NEXT: [[ENTRY:.*:]]
74102// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [9 x i8], align 1
75103// CHECK-NEXT: store <9 x i8> [[A]], ptr [[A_ADDR]], align 1
@@ -83,7 +111,7 @@ i8x3x3 m1(i8x3x3 a) {
83111}
84112
85113// CHECK-LABEL: define dso_local noundef <9 x i32> @_Z2m2u11matrix_typeILm3ELm3EDB32_E(
86- // CHECK-SAME: <9 x i32> noundef [[A:%.*]]) #[[ATTR4 :[0-9]+]] {
114+ // CHECK-SAME: <9 x i32> noundef [[A:%.*]]) #[[ATTR5 :[0-9]+]] {
87115// CHECK-NEXT: [[ENTRY:.*:]]
88116// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [9 x i32], align 4
89117// CHECK-NEXT: store <9 x i32> [[A]], ptr [[A_ADDR]], align 4
@@ -97,7 +125,7 @@ i32x3x3 m2(i32x3x3 a) {
97125}
98126
99127// CHECK-LABEL: define dso_local noundef <9 x i512> @_Z2m3u11matrix_typeILm3ELm3EDB512_E(
100- // CHECK-SAME: <9 x i512> noundef [[A:%.*]]) #[[ATTR5 :[0-9]+]] {
128+ // CHECK-SAME: <9 x i512> noundef [[A:%.*]]) #[[ATTR6 :[0-9]+]] {
101129// CHECK-NEXT: [[ENTRY:.*:]]
102130// CHECK-NEXT: [[A_ADDR:%.*]] = alloca [9 x i512], align 8
103131// CHECK-NEXT: store <9 x i512> [[A]], ptr [[A_ADDR]], align 8
@@ -109,3 +137,17 @@ i32x3x3 m2(i32x3x3 a) {
109137i512x3x3 m3 (i512x3x3 a) {
110138 return a + a;
111139}
140+
141+ // CHECK-LABEL: define dso_local noundef <9 x i4> @_Z2m4u11matrix_typeILm3ELm3EDB4_E(
142+ // CHECK-SAME: <9 x i4> noundef [[A:%.*]]) #[[ATTR7:[0-9]+]] {
143+ // CHECK-NEXT: [[ENTRY:.*:]]
144+ // CHECK-NEXT: [[A_ADDR:%.*]] = alloca [9 x i4], align 1
145+ // CHECK-NEXT: store <9 x i4> [[A]], ptr [[A_ADDR]], align 1
146+ // CHECK-NEXT: [[TMP0:%.*]] = load <9 x i4>, ptr [[A_ADDR]], align 1
147+ // CHECK-NEXT: [[TMP1:%.*]] = load <9 x i4>, ptr [[A_ADDR]], align 1
148+ // CHECK-NEXT: [[TMP2:%.*]] = add <9 x i4> [[TMP0]], [[TMP1]]
149+ // CHECK-NEXT: ret <9 x i4> [[TMP2]]
150+ //
151+ i4x3x3 m4 (i4x3x3 a) {
152+ return a + a;
153+ }
0 commit comments