Skip to content

Conversation

@RKSimon
Copy link
Collaborator

@RKSimon RKSimon commented May 21, 2025

If we're not going to split the v8f32 shuffle anyway, attempt to match with lowerShufflePairAsUNPCKAndPermute

…X1 targets

If we're not going to split the v8f32 shuffle anyway, attempt to match with lowerShufflePairAsUNPCKAndPermute
@llvmbot
Copy link
Member

llvmbot commented May 21, 2025

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

If we're not going to split the v8f32 shuffle anyway, attempt to match with lowerShufflePairAsUNPCKAndPermute


Patch is 20.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140881.diff

3 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+5-4)
  • (modified) llvm/test/CodeGen/X86/vector-interleave.ll (+4-9)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll (+126-181)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3d5ef1fc28ec8..977401e0db713 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16705,10 +16705,11 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // Try to match an interleave of two v8f32s and lower them as unpck and
   // permutes using ymms. This needs to go before we try to split the vectors.
-  //
-  // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
-  // this path inadvertently.
-  if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
+  // Don't attempt on AVX1 if we're likely to split vectors anyway.
+  if ((Subtarget.hasAVX2() ||
+       !(isFreeToSplitVector(peekThroughBitcasts(V1), DAG) ||
+         isFreeToSplitVector(peekThroughBitcasts(V2), DAG))) &&
+      !Subtarget.hasAVX512())
     if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
                                                       Mask, DAG))
       return V;
diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll
index 63ca7c6a00573..206f7ed43fd6d 100644
--- a/llvm/test/CodeGen/X86/vector-interleave.ll
+++ b/llvm/test/CodeGen/X86/vector-interleave.ll
@@ -265,15 +265,10 @@ define <16 x i32> @interleave2x8i32(<8 x i32> %a, <8 x i32> %b) {
 ;
 ; AVX1-LABEL: interleave2x8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm1
-; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX1-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: interleave2x8i32:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll
index 47526e960328e..c2ce612c33c2e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll
@@ -290,18 +290,14 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
 ;
 ; AVX-LABEL: store_i32_stride2_vf8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rsi), %xmm0
-; AVX-NEXT:    vmovaps 16(%rsi), %xmm1
-; AVX-NEXT:    vmovaps (%rdi), %xmm2
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm3
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps %ymm1, 32(%rdx)
-; AVX-NEXT:    vmovaps %ymm0, (%rdx)
+; AVX-NEXT:    vmovaps (%rdi), %ymm0
+; AVX-NEXT:    vmovaps (%rsi), %ymm1
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm1
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3]
+; AVX-NEXT:    vmovaps %ymm0, 32(%rdx)
+; AVX-NEXT:    vmovaps %ymm1, (%rdx)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -466,30 +462,22 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ;
 ; AVX-LABEL: store_i32_stride2_vf16:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rsi), %xmm0
-; AVX-NEXT:    vmovaps 16(%rsi), %xmm1
-; AVX-NEXT:    vmovaps 32(%rsi), %xmm2
-; AVX-NEXT:    vmovaps 48(%rsi), %xmm3
-; AVX-NEXT:    vmovaps (%rdi), %xmm4
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm5
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm6
-; AVX-NEXT:    vmovaps 48(%rdi), %xmm7
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm8 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm1, %ymm1
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX-NEXT:    vmovaps %ymm3, 96(%rdx)
-; AVX-NEXT:    vmovaps %ymm2, 64(%rdx)
-; AVX-NEXT:    vmovaps %ymm0, (%rdx)
-; AVX-NEXT:    vmovaps %ymm1, 32(%rdx)
+; AVX-NEXT:    vmovaps (%rdi), %ymm0
+; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
+; AVX-NEXT:    vmovaps (%rsi), %ymm2
+; AVX-NEXT:    vmovaps 32(%rsi), %ymm3
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5]
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm2
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm3
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3]
+; AVX-NEXT:    vmovaps %ymm1, 96(%rdx)
+; AVX-NEXT:    vmovaps %ymm0, 32(%rdx)
+; AVX-NEXT:    vmovaps %ymm3, 64(%rdx)
+; AVX-NEXT:    vmovaps %ymm2, (%rdx)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -732,54 +720,38 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ;
 ; AVX-LABEL: store_i32_stride2_vf32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps 112(%rsi), %xmm0
-; AVX-NEXT:    vmovaps 112(%rdi), %xmm1
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps 96(%rsi), %xmm1
-; AVX-NEXT:    vmovaps 96(%rdi), %xmm2
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX-NEXT:    vmovaps 64(%rsi), %xmm2
-; AVX-NEXT:    vmovaps 64(%rdi), %xmm3
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX-NEXT:    vmovaps 80(%rsi), %xmm3
-; AVX-NEXT:    vmovaps 80(%rdi), %xmm4
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX-NEXT:    vmovaps (%rsi), %xmm4
-; AVX-NEXT:    vmovaps 16(%rsi), %xmm5
-; AVX-NEXT:    vmovaps 32(%rsi), %xmm6
-; AVX-NEXT:    vmovaps 48(%rsi), %xmm7
-; AVX-NEXT:    vmovaps (%rdi), %xmm8
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm9
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm10
-; AVX-NEXT:    vmovaps 48(%rdi), %xmm11
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm4, %ymm4
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm6[2],xmm10[3],xmm6[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm6, %ymm6
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm7, %ymm7
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm5, %ymm5
-; AVX-NEXT:    vmovaps %ymm5, 32(%rdx)
-; AVX-NEXT:    vmovaps %ymm7, 96(%rdx)
-; AVX-NEXT:    vmovaps %ymm6, 64(%rdx)
+; AVX-NEXT:    vmovaps (%rdi), %ymm0
+; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 64(%rdi), %ymm2
+; AVX-NEXT:    vmovaps 96(%rdi), %ymm3
+; AVX-NEXT:    vmovaps (%rsi), %ymm4
+; AVX-NEXT:    vmovaps 32(%rsi), %ymm5
+; AVX-NEXT:    vmovaps 64(%rsi), %ymm6
+; AVX-NEXT:    vmovaps 96(%rsi), %ymm7
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm8 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5]
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm8, %ymm4
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm9 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5]
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm9, %ymm5
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm10 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5]
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm10, %ymm6
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm11 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[4],ymm7[4],ymm3[5],ymm7[5]
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[6],ymm7[6],ymm3[7],ymm7[7]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm11, %ymm7
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm10[2,3],ymm2[2,3]
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3]
+; AVX-NEXT:    vmovaps %ymm3, 224(%rdx)
+; AVX-NEXT:    vmovaps %ymm2, 160(%rdx)
+; AVX-NEXT:    vmovaps %ymm1, 96(%rdx)
+; AVX-NEXT:    vmovaps %ymm0, 32(%rdx)
+; AVX-NEXT:    vmovaps %ymm7, 192(%rdx)
+; AVX-NEXT:    vmovaps %ymm6, 128(%rdx)
+; AVX-NEXT:    vmovaps %ymm5, 64(%rdx)
 ; AVX-NEXT:    vmovaps %ymm4, (%rdx)
-; AVX-NEXT:    vmovaps %ymm3, 160(%rdx)
-; AVX-NEXT:    vmovaps %ymm2, 128(%rdx)
-; AVX-NEXT:    vmovaps %ymm1, 192(%rdx)
-; AVX-NEXT:    vmovaps %ymm0, 224(%rdx)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -1216,106 +1188,79 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ;
 ; AVX-LABEL: store_i32_stride2_vf64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps 240(%rsi), %xmm0
-; AVX-NEXT:    vmovaps 240(%rdi), %xmm1
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 224(%rsi), %xmm1
-; AVX-NEXT:    vmovaps 224(%rdi), %xmm2
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm0
-; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT:    vmovaps 128(%rsi), %xmm2
-; AVX-NEXT:    vmovaps 128(%rdi), %xmm3
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX-NEXT:    vmovaps 144(%rsi), %xmm3
-; AVX-NEXT:    vmovaps 144(%rdi), %xmm4
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX-NEXT:    vmovaps (%rsi), %xmm4
-; AVX-NEXT:    vmovaps 16(%rsi), %xmm8
-; AVX-NEXT:    vmovaps 32(%rsi), %xmm5
-; AVX-NEXT:    vmovaps 48(%rsi), %xmm11
-; AVX-NEXT:    vmovaps (%rdi), %xmm6
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm9
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm7
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm10 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm4, %ymm4
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX-NEXT:    vmovaps 64(%rsi), %xmm6
-; AVX-NEXT:    vmovaps 64(%rdi), %xmm7
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm6, %ymm6
-; AVX-NEXT:    vmovaps 96(%rsi), %xmm7
-; AVX-NEXT:    vmovaps 96(%rdi), %xmm10
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm12 = xmm10[2],xmm7[2],xmm10[3],xmm7[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm7, %ymm7
-; AVX-NEXT:    vmovaps 160(%rsi), %xmm10
-; AVX-NEXT:    vmovaps 160(%rdi), %xmm12
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm13 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm10, %ymm10
-; AVX-NEXT:    vmovaps 192(%rsi), %xmm12
-; AVX-NEXT:    vmovaps 192(%rdi), %xmm13
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm12, %ymm12
-; AVX-NEXT:    vmovaps 80(%rsi), %xmm13
-; AVX-NEXT:    vmovaps 80(%rdi), %xmm14
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
-; AVX-NEXT:    vmovaps 48(%rdi), %xmm14
-; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm13, %ymm13
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm11[2],xmm14[3],xmm11[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm11, %ymm11
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm14 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm8, %ymm8
-; AVX-NEXT:    vmovaps 112(%rsi), %xmm9
-; AVX-NEXT:    vmovaps 112(%rdi), %xmm14
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm9[2],xmm14[3],xmm9[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm9[0],xmm14[1],xmm9[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm9, %ymm9
-; AVX-NEXT:    vmovaps 176(%rsi), %xmm14
-; AVX-NEXT:    vmovaps 176(%rdi), %xmm15
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm14, %ymm1
-; AVX-NEXT:    vmovaps 208(%rsi), %xmm14
-; AVX-NEXT:    vmovaps 208(%rdi), %xmm15
-; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX-NEXT:    vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm14, %ymm0
-; AVX-NEXT:    vmovaps %ymm0, 416(%rdx)
-; AVX-NEXT:    vmovaps %ymm1, 352(%rdx)
-; AVX-NEXT:    vmovaps %ymm9, 224(%rdx)
-; AVX-NEXT:    vmovaps %ymm8, 32(%rdx)
-; AVX-NEXT:    vmovaps %ymm11, 96(%rdx)
-; AVX-NEXT:    vmovaps %ymm13, 160(%rdx)
-; AVX-NEXT:    vmovaps %ymm12, 384(%rdx)
-; AVX-NEXT:    vmovaps %ymm10, 320(%rdx)
-; AVX-NEXT:    vmovaps %ymm7, 192(%rdx)
-; AVX-NEXT:    vmovaps %ymm6, 128(%rdx)
-; AVX-NEXT:    vmovaps %ymm5, 64(%rdx)
-; AVX-NEXT:    vmovaps %ymm4, (%rdx)
-; AVX-NEXT:    vmovaps %ymm3, 288(%rdx)
-; AVX-NEXT:    vmovaps %ymm2, 256(%rdx)
-; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT:    vmovaps %ymm0, 448(%rdx)
+; AVX-NEXT:    vmovaps 224(%rdi), %ymm0
+; AVX-NEXT:    vmovaps 192(%rdi), %ymm1
+; AVX-NEXT:    vmovaps 160(%rdi), %ymm2
+; AVX-NEXT:    vmovaps 128(%rdi), %ymm5
+; AVX-NEXT:    vmovaps (%rdi), %ymm4
+; AVX-NEXT:    vmovaps 32(%rdi), %ymm6
+; AVX-NEXT:    vmovaps 64(%rdi), %ymm7
+; AVX-NEXT:    vmovaps 96(%rdi), %ymm8
+; AVX-NEXT:    vmovaps 192(%rsi), %ymm9
+; AVX-NEXT:    vmovaps 160(%rsi), %ymm10
+; AVX-NEXT:    vmovaps 128(%rsi), %ymm11
+; AVX-NEXT:    vmovaps (%rsi), %ymm12
+; AVX-NEXT:    vmovaps 32(%rsi), %ymm13
+; AVX-NEXT:    vmovaps 64(%rsi), %ymm14
+; AVX-NEXT:    vmovaps 96(%rsi), %ymm15
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm3 = ymm4[0],ymm12[0],ymm4[1],ymm12[1],ymm4[4],ymm12[4],ymm4[5],ymm12[5]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm12[2],ymm4[3],ymm12[3],ymm4[6],ymm12[6],ymm4[7],ymm12[7]
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm3 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[4],ymm13[4],ymm6[5],ymm13[5]
+; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm12 = ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[6],ymm13[6],ymm6[7],ymm13[7]
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm13 = ymm7[0],ymm14[0],ymm7[1],ymm14[1],ymm7[4],ymm14[4],ymm7[5],ymm14[5]
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm7 = ymm7[2],ymm14[2],ymm7[3],ymm14[3],ymm7[6],ymm14[6],ymm7[7],ymm14[7]
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[4],ymm15[4],ymm8[5],ymm15[5]
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm15[2],ymm8[3],ymm15[3],ymm8[6],ymm15[6],ymm8[7],ymm15[7]
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm5[0],ymm11[0],ymm5[1],ymm11[1],ymm5[4],ymm11[4],ymm5[5],ymm11[5]
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7]
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm11 = ymm2[0],ymm10[0],ymm2[1],ymm10[1],ymm2[4],ymm10[4],ymm2[5],ymm10[5]
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm5 = ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[6],ymm10[6],ymm2[7],ymm10[7]
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm10 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5]
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7]
+; AVX-NEXT:    vmovaps 224(%rsi), %ymm9
+; AVX-NEXT:    vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm9[0],ymm0[1],ymm9[1],ymm0[4],ymm9[4],ymm0[5],ymm9[5]
+; AVX-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7]
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT:    vmovaps %ymm4, %ymm0
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm9, %ymm4
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm4
+; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm12[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm13, %ymm12
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm13[2,3],ymm7[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm14, %ymm13
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = ymm14[2,3],ymm8[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm15, %ymm14
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm15[2,3],ymm6[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm11, %ymm15
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm11[2,3],ymm5[2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm10, %ymm11
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm10...
[truncated]

@RKSimon RKSimon merged commit 4091b5d into llvm:main May 21, 2025
13 checks passed
@RKSimon RKSimon deleted the x86-interleave-v8f32-avx1 branch May 21, 2025 12:00
@llvm-ci
Copy link
Collaborator

llvm-ci commented May 21, 2025

LLVM Buildbot has detected a new failure on builder lldb-x86_64-debian running on lldb-x86_64-debian while building llvm at step 6 "test".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/162/builds/22845

Here is the relevant piece of the build log for reference:
Step 6 (test) failure: build (failure)
...
PASS: lldb-unit :: Utility/./UtilityTests/94/129 (2942 of 2953)
UNSUPPORTED: lldb-shell :: ScriptInterpreter/Lua/partial_statements.test (2943 of 2953)
UNSUPPORTED: lldb-shell :: Watchpoint/LocalVariableWatchpointDisabler.test (2944 of 2953)
UNSUPPORTED: lldb-shell :: Heap/heap-cstr.test (2945 of 2953)
UNSUPPORTED: lldb-shell :: ScriptInterpreter/Lua/breakpoint_function_callback.test (2946 of 2953)
UNSUPPORTED: lldb-shell :: ScriptInterpreter/Lua/lua.test (2947 of 2953)
UNSUPPORTED: lldb-shell :: Register/riscv64-gp-read.test (2948 of 2953)
PASS: lldb-api :: api/multithreaded/TestMultithreaded.py (2949 of 2953)
PASS: lldb-api :: terminal/TestEditlineCompletions.py (2950 of 2953)
UNRESOLVED: lldb-api :: tools/lldb-dap/launch/TestDAP_launch.py (2951 of 2953)
******************** TEST 'lldb-api :: tools/lldb-dap/launch/TestDAP_launch.py' FAILED ********************
Script:
--
/usr/bin/python3 /home/worker/2.0.1/lldb-x86_64-debian/llvm-project/lldb/test/API/dotest.py -u CXXFLAGS -u CFLAGS --env LLVM_LIBS_DIR=/home/worker/2.0.1/lldb-x86_64-debian/build/./lib --env LLVM_INCLUDE_DIR=/home/worker/2.0.1/lldb-x86_64-debian/build/include --env LLVM_TOOLS_DIR=/home/worker/2.0.1/lldb-x86_64-debian/build/./bin --arch x86_64 --build-dir /home/worker/2.0.1/lldb-x86_64-debian/build/lldb-test-build.noindex --lldb-module-cache-dir /home/worker/2.0.1/lldb-x86_64-debian/build/lldb-test-build.noindex/module-cache-lldb/lldb-api --clang-module-cache-dir /home/worker/2.0.1/lldb-x86_64-debian/build/lldb-test-build.noindex/module-cache-clang/lldb-api --executable /home/worker/2.0.1/lldb-x86_64-debian/build/./bin/lldb --compiler /home/worker/2.0.1/lldb-x86_64-debian/build/./bin/clang --dsymutil /home/worker/2.0.1/lldb-x86_64-debian/build/./bin/dsymutil --make /usr/bin/gmake --llvm-tools-dir /home/worker/2.0.1/lldb-x86_64-debian/build/./bin --lldb-obj-root /home/worker/2.0.1/lldb-x86_64-debian/build/tools/lldb --lldb-libs-dir /home/worker/2.0.1/lldb-x86_64-debian/build/./lib -t /home/worker/2.0.1/lldb-x86_64-debian/llvm-project/lldb/test/API/tools/lldb-dap/launch -p TestDAP_launch.py
--
Exit Code: 1

Command Output (stdout):
--
lldb version 21.0.0git (https://github.com/llvm/llvm-project.git revision 4091b5d9ec985e1b2b5fc1810c6bfeff7800089b)
  clang revision 4091b5d9ec985e1b2b5fc1810c6bfeff7800089b
  llvm revision 4091b5d9ec985e1b2b5fc1810c6bfeff7800089b
Skipping the following test categories: ['libc++', 'dsym', 'gmodules', 'debugserver', 'objc']

--
Command Output (stderr):
--
Change dir to: /home/worker/2.0.1/lldb-x86_64-debian/llvm-project/lldb/test/API/tools/lldb-dap/launch
runCmd: settings clear --all

output: 

runCmd: settings set symbols.enable-external-lookup false

output: 

runCmd: settings set target.inherit-tcc true

output: 

runCmd: settings set target.disable-aslr false

output: 

runCmd: settings set target.detach-on-error false

output: 

runCmd: settings set target.auto-apply-fixits false

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants