@@ -84,13 +84,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-NEXT: vmovdqa (%rdx), %xmm2
+; AVX2-NEXT: vmovdqa (%rdx), %xmm1
+; AVX2-NEXT: vmovdqa (%r8), %xmm2
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX2-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX2-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -109,13 +109,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -134,13 +134,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -159,13 +159,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512-NEXT: vmovdqa (%r8), %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -184,13 +184,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -209,13 +209,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -234,13 +234,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
@@ -259,13 +259,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
 ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0
@@ -280,13 +280,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
 ; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
@@ -301,13 +301,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
 ; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0
@@ -322,13 +322,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0