@@ -240,21 +240,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
240240; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
241241; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6
242242; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
243- ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
244- ; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1]
243+ ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
245244; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
246- ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
247- ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
248- ; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1]
245+ ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
249246; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
250- ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1
251247; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
252248; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
253249; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx)
254250; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
255251; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
256252; AVX512-FCP-NEXT: vmovq %xmm7, (%r10)
257- ; AVX512-FCP-NEXT: vmovq %xmm1 , (%rax)
253+ ; AVX512-FCP-NEXT: vmovq %xmm8 , (%rax)
258254; AVX512-FCP-NEXT: vzeroupper
259255; AVX512-FCP-NEXT: retq
260256;
@@ -309,21 +305,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
309305; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
310306; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6
311307; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
312- ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
313- ; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1]
308+ ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
314309; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
315- ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
316- ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
317- ; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1]
310+ ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
318311; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
319- ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1
320312; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
321313; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
322314; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx)
323315; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
324316; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
325317; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10)
326- ; AVX512DQ-FCP-NEXT: vmovq %xmm1 , (%rax)
318+ ; AVX512DQ-FCP-NEXT: vmovq %xmm8 , (%rax)
327319; AVX512DQ-FCP-NEXT: vzeroupper
328320; AVX512DQ-FCP-NEXT: retq
329321;
@@ -378,21 +370,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
378370; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
379371; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6
380372; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
381- ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
382- ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
373+ ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
383374; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
384- ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
385- ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
386- ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
375+ ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
387376; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
388- ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1
389377; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
390378; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
391379; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx)
392380; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8)
393381; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
394382; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10)
395- ; AVX512BW-FCP-NEXT: vmovq %xmm1 , (%rax)
383+ ; AVX512BW-FCP-NEXT: vmovq %xmm8 , (%rax)
396384; AVX512BW-FCP-NEXT: vzeroupper
397385; AVX512BW-FCP-NEXT: retq
398386;
@@ -447,21 +435,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
447435; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
448436; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6
449437; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
450- ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
451- ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
438+ ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
452439; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
453- ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
454- ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
455- ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
440+ ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
456441; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
457- ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1
458442; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
459443; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
460444; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
461445; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8)
462446; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
463447; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
464- ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1 , (%rax)
448+ ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8 , (%rax)
465449; AVX512DQ-BW-FCP-NEXT: vzeroupper
466450; AVX512DQ-BW-FCP-NEXT: retq
467451 %wide.vec = load <14 x i32>, ptr %in.vec, align 64
0 commit comments