@@ -8,16 +8,14 @@ define void @fneg_2x2(ptr %in, ptr %out) {
88; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
99; CHECK-NEXT: [[TMP1:%.*]] = fneg <2 x float> [[COL_LOAD]]
1010; CHECK-NEXT: [[TMP2:%.*]] = fneg <2 x float> [[COL_LOAD1]]
11- ; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
11+ ; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 4
1212; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, ptr [[OUT]], i64 2
13- ; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 8
13+ ; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 4
1414; CHECK-NEXT: ret void
1515;
1616 %inv = load <4 x float >, ptr %in
1717 %op = fneg <4 x float > %inv
18- %opt = call <4 x float > @llvm.matrix.transpose (<4 x float > %op , i32 2 , i32 2 )
19- %optt = call <4 x float > @llvm.matrix.transpose (<4 x float > %opt , i32 2 , i32 2 )
20- store <4 x float > %optt , ptr %out
18+ call void @llvm.matrix.column.major.store (<4 x float > %op , ptr %out , i64 2 , i1 false , i32 2 , i32 2 )
2119 ret void
2220}
2321
@@ -28,16 +26,14 @@ define void @trunc_2x2(ptr %in, ptr %out) {
2826; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
2927; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i64> [[COL_LOAD]] to <2 x i32>
3028; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i64> [[COL_LOAD1]] to <2 x i32>
31- ; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
29+ ; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
3230; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
33- ; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 8
31+ ; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 4
3432; CHECK-NEXT: ret void
3533;
3634 %inv = load <4 x i64 >, ptr %in
3735 %op = trunc <4 x i64 > %inv to <4 x i32 >
38- %opt = call <4 x i32 > @llvm.matrix.transpose (<4 x i32 > %op , i32 2 , i32 2 )
39- %optt = call <4 x i32 > @llvm.matrix.transpose (<4 x i32 > %opt , i32 2 , i32 2 )
40- store <4 x i32 > %optt , ptr %out
36+ call void @llvm.matrix.column.major.store (<4 x i32 > %op , ptr %out , i64 2 , i1 false , i32 2 , i32 2 )
4137 ret void
4238}
4339
@@ -48,16 +44,14 @@ define void @zext_2x2(ptr %in, ptr %out) {
4844; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i16>, ptr [[VEC_GEP]], align 4
4945; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[COL_LOAD]] to <2 x i32>
5046; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i16> [[COL_LOAD1]] to <2 x i32>
51- ; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
47+ ; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
5248; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
53- ; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 8
49+ ; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 4
5450; CHECK-NEXT: ret void
5551;
5652 %inv = load <4 x i16 >, ptr %in
5753 %op = zext <4 x i16 > %inv to <4 x i32 >
58- %opt = call <4 x i32 > @llvm.matrix.transpose (<4 x i32 > %op , i32 2 , i32 2 )
59- %optt = call <4 x i32 > @llvm.matrix.transpose (<4 x i32 > %opt , i32 2 , i32 2 )
60- store <4 x i32 > %optt , ptr %out
54+ call void @llvm.matrix.column.major.store (<4 x i32 > %op , ptr %out , i64 2 , i1 false , i32 2 , i32 2 )
6155 ret void
6256}
6357
@@ -68,16 +62,14 @@ define void @sext_2x2(ptr %in, ptr %out) {
6862; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i8>, ptr [[VEC_GEP]], align 2
6963; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i8> [[COL_LOAD]] to <2 x i16>
7064; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[COL_LOAD1]] to <2 x i16>
71- ; CHECK-NEXT: store <2 x i16> [[TMP1]], ptr [[OUT:%.*]], align 8
65+ ; CHECK-NEXT: store <2 x i16> [[TMP1]], ptr [[OUT:%.*]], align 2
7266; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i16, ptr [[OUT]], i64 2
73- ; CHECK-NEXT: store <2 x i16> [[TMP2]], ptr [[VEC_GEP2]], align 4
67+ ; CHECK-NEXT: store <2 x i16> [[TMP2]], ptr [[VEC_GEP2]], align 2
7468; CHECK-NEXT: ret void
7569;
7670 %inv = load <4 x i8 >, ptr %in
7771 %op = sext <4 x i8 > %inv to <4 x i16 >
78- %opt = call <4 x i16 > @llvm.matrix.transpose (<4 x i16 > %op , i32 2 , i32 2 )
79- %optt = call <4 x i16 > @llvm.matrix.transpose (<4 x i16 > %opt , i32 2 , i32 2 )
80- store <4 x i16 > %optt , ptr %out
72+ call void @llvm.matrix.column.major.store (<4 x i16 > %op , ptr %out , i64 2 , i1 false , i32 2 , i32 2 )
8173 ret void
8274}
8375
@@ -88,16 +80,14 @@ define void @fptoui_2x2(ptr %in, ptr %out) {
8880; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
8981; CHECK-NEXT: [[TMP1:%.*]] = fptoui <2 x float> [[COL_LOAD]] to <2 x i32>
9082; CHECK-NEXT: [[TMP2:%.*]] = fptoui <2 x float> [[COL_LOAD1]] to <2 x i32>
91- ; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
83+ ; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
9284; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
93- ; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 8
85+ ; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 4
9486; CHECK-NEXT: ret void
9587;
9688 %inv = load <4 x float >, ptr %in
9789 %op = fptoui <4 x float > %inv to <4 x i32 >
98- %opt = call <4 x i32 > @llvm.matrix.transpose (<4 x i32 > %op , i32 2 , i32 2 )
99- %optt = call <4 x i32 > @llvm.matrix.transpose (<4 x i32 > %opt , i32 2 , i32 2 )
100- store <4 x i32 > %optt , ptr %out
90+ call void @llvm.matrix.column.major.store (<4 x i32 > %op , ptr %out , i64 2 , i1 false , i32 2 , i32 2 )
10191 ret void
10292}
10393
@@ -108,16 +98,14 @@ define void @fptosi_2x2(ptr %in, ptr %out) {
10898; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
10999; CHECK-NEXT: [[TMP1:%.*]] = fptosi <2 x float> [[COL_LOAD]] to <2 x i32>
110100; CHECK-NEXT: [[TMP2:%.*]] = fptosi <2 x float> [[COL_LOAD1]] to <2 x i32>
111- ; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16
101+ ; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
112102; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
113- ; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 8
103+ ; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 4
114104; CHECK-NEXT: ret void
115105;
116106 %inv = load <4 x float >, ptr %in
117107 %op = fptosi <4 x float > %inv to <4 x i32 >
118- %opt = call <4 x i32 > @llvm.matrix.transpose (<4 x i32 > %op , i32 2 , i32 2 )
119- %optt = call <4 x i32 > @llvm.matrix.transpose (<4 x i32 > %opt , i32 2 , i32 2 )
120- store <4 x i32 > %optt , ptr %out
108+ call void @llvm.matrix.column.major.store (<4 x i32 > %op , ptr %out , i64 2 , i1 false , i32 2 , i32 2 )
121109 ret void
122110}
123111
@@ -128,16 +116,14 @@ define void @uitofp_2x2(ptr %in, ptr %out) {
128116; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
129117; CHECK-NEXT: [[TMP1:%.*]] = uitofp <2 x i64> [[COL_LOAD]] to <2 x double>
130118; CHECK-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[COL_LOAD1]] to <2 x double>
131- ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
119+ ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 8
132120; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
133- ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 16
121+ ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 8
134122; CHECK-NEXT: ret void
135123;
136124 %inv = load <4 x i64 >, ptr %in
137125 %op = uitofp <4 x i64 > %inv to <4 x double >
138- %opt = call <4 x double > @llvm.matrix.transpose (<4 x double > %op , i32 2 , i32 2 )
139- %optt = call <4 x double > @llvm.matrix.transpose (<4 x double > %opt , i32 2 , i32 2 )
140- store <4 x double > %optt , ptr %out
126+ call void @llvm.matrix.column.major.store (<4 x double > %op , ptr %out , i64 2 , i1 false , i32 2 , i32 2 )
141127 ret void
142128}
143129
@@ -148,16 +134,14 @@ define void @sitofp_2x2(ptr %in, ptr %out) {
148134; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
149135; CHECK-NEXT: [[TMP1:%.*]] = sitofp <2 x i64> [[COL_LOAD]] to <2 x double>
150136; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[COL_LOAD1]] to <2 x double>
151- ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
137+ ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 8
152138; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
153- ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 16
139+ ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 8
154140; CHECK-NEXT: ret void
155141;
156142 %inv = load <4 x i64 >, ptr %in
157143 %op = sitofp <4 x i64 > %inv to <4 x double >
158- %opt = call <4 x double > @llvm.matrix.transpose (<4 x double > %op , i32 2 , i32 2 )
159- %optt = call <4 x double > @llvm.matrix.transpose (<4 x double > %opt , i32 2 , i32 2 )
160- store <4 x double > %optt , ptr %out
144+ call void @llvm.matrix.column.major.store (<4 x double > %op , ptr %out , i64 2 , i1 false , i32 2 , i32 2 )
161145 ret void
162146}
163147
@@ -168,16 +152,14 @@ define void @fptrunc_2x2(ptr %in, ptr %out) {
168152; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16
169153; CHECK-NEXT: [[TMP1:%.*]] = fptrunc nnan <2 x double> [[COL_LOAD]] to <2 x float>
170154; CHECK-NEXT: [[TMP2:%.*]] = fptrunc nnan <2 x double> [[COL_LOAD1]] to <2 x float>
171- ; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
155+ ; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 4
172156; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr float, ptr [[OUT]], i64 2
173- ; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 8
157+ ; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 4
174158; CHECK-NEXT: ret void
175159;
176160 %inv = load <4 x double >, ptr %in
177161 %op = fptrunc nnan <4 x double > %inv to <4 x float >
178- %opt = call <4 x float > @llvm.matrix.transpose (<4 x float > %op , i32 2 , i32 2 )
179- %optt = call <4 x float > @llvm.matrix.transpose (<4 x float > %opt , i32 2 , i32 2 )
180- store <4 x float > %optt , ptr %out
162+ call void @llvm.matrix.column.major.store (<4 x float > %op , ptr %out , i64 2 , i1 false , i32 2 , i32 2 )
181163 ret void
182164}
183165
@@ -188,16 +170,14 @@ define void @fpext_2x2(ptr %in, ptr %out) {
188170; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
189171; CHECK-NEXT: [[TMP1:%.*]] = fpext <2 x float> [[COL_LOAD]] to <2 x double>
190172; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[COL_LOAD1]] to <2 x double>
191- ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
173+ ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 8
192174; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
193- ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 16
175+ ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 8
194176; CHECK-NEXT: ret void
195177;
196178 %inv = load <4 x float >, ptr %in
197179 %op = fpext <4 x float > %inv to <4 x double >
198- %opt = call <4 x double > @llvm.matrix.transpose (<4 x double > %op , i32 2 , i32 2 )
199- %optt = call <4 x double > @llvm.matrix.transpose (<4 x double > %opt , i32 2 , i32 2 )
200- store <4 x double > %optt , ptr %out
180+ call void @llvm.matrix.column.major.store (<4 x double > %op , ptr %out , i64 2 , i1 false , i32 2 , i32 2 )
201181 ret void
202182}
203183
@@ -208,30 +188,30 @@ define void @bitcast_2x2_v4f64_to_v4i64(ptr %in, ptr %out) {
208188; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16
209189; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[COL_LOAD]] to <2 x i64>
210190; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[COL_LOAD1]] to <2 x i64>
211- ; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[OUT:%.*]], align 32
191+ ; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[OUT:%.*]], align 4
212192; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i64, ptr [[OUT]], i64 2
213- ; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[VEC_GEP2]], align 16
193+ ; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[VEC_GEP2]], align 4
214194; CHECK-NEXT: ret void
215195;
216196 %inv = load <4 x double >, ptr %in
217197 %op = bitcast <4 x double > %inv to <4 x i64 >
218- %opt = call <4 x i64 > @llvm.matrix.transpose (<4 x i64 > %op , i32 2 , i32 2 )
219- %optt = call <4 x i64 > @llvm.matrix.transpose (<4 x i64 > %opt , i32 2 , i32 2 )
220- store <4 x i64 > %optt , ptr %out
198+ call void @llvm.matrix.column.major.store (<4 x i64 > %op , ptr %out , i64 2 , i1 false , i32 2 , i32 2 )
221199 ret void
222200}
223201
; Test: a bitcast whose source is a scalar (i256) rather than a vector, feeding
; a 2x2 column-major matrix store.
;
; Input IR (the '+' side of this diff): loads an i256 from %in, bitcasts it to
; <4 x double>, and passes the result to @llvm.matrix.column.major.store with
; operands (value, %out, i64 2, i1 false, i32 2, i32 2). Per the LangRef
; signature these are stride = 2, non-volatile, 2 rows, 2 columns.
;
; Expected output: the bitcast is kept on the whole i256 value; its
; <4 x double> result is then split by two shufflevectors into the lanes
; <0,1> and <2,3>, and each <2 x double> half is stored with align 8, the
; second one at a GEP of 2 doubles past [[OUT]].
;
; The '-' side of the diff used two back-to-back @llvm.matrix.transpose calls
; followed by a plain store, and expected a single <4 x double> store with
; align 32.
;
; NOTE(review): the function name ends in "_to_v4i64" but the bitcast target
; is <4 x double>; possibly a leftover name -- confirm before renaming, since
; the CHECK-LABEL must match the function name.
224202define void @bitcast_2x2_i256_to_v4i64 (ptr %in , ptr %out ) {
225203; CHECK-LABEL: @bitcast_2x2_i256_to_v4i64(
226204; CHECK-NEXT: [[INV:%.*]] = load i256, ptr [[IN:%.*]], align 4
227205; CHECK-NEXT: [[OP:%.*]] = bitcast i256 [[INV]] to <4 x double>
228- ; CHECK-NEXT: store <4 x double> [[OP]], ptr [[OUT:%.*]], align 32
206+ ; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
207+ ; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
208+ ; CHECK-NEXT: store <2 x double> [[SPLIT]], ptr [[OUT:%.*]], align 8
209+ ; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT]], i64 2
210+ ; CHECK-NEXT: store <2 x double> [[SPLIT1]], ptr [[VEC_GEP]], align 8
229211; CHECK-NEXT: ret void
230212;
231213 %inv = load i256 , ptr %in
232214 %op = bitcast i256 %inv to <4 x double >
233- %opt = call <4 x double > @llvm.matrix.transpose (<4 x double > %op , i32 2 , i32 2 )
234- %optt = call <4 x double > @llvm.matrix.transpose (<4 x double > %opt , i32 2 , i32 2 )
235- store <4 x double > %optt , ptr %out
215+ call void @llvm.matrix.column.major.store (<4 x double > %op , ptr %out , i64 2 , i1 false , i32 2 , i32 2 )
236216 ret void
237217}
0 commit comments