diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ca91d35573c3e..b09d33eb20296 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2668,8 +2668,8 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
     return C;
 
   // canonicalize constant to RHS
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
-      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1)))
     return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
 
   if (areBitwiseNotOfEachother(N0, N1))
@@ -3048,8 +3048,8 @@ SDValue DAGCombiner::visitADDSAT(SDNode *N) {
     return C;
 
   // canonicalize constant to RHS
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
-      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1)))
     return DAG.getNode(Opcode, DL, VT, N1, N0);
 
   // fold vector ops
@@ -3306,8 +3306,8 @@ SDValue DAGCombiner::visitADDO(SDNode *N) {
                      DAG.getUNDEF(CarryVT));
 
   // canonicalize constant to RHS.
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
-      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1)))
     return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);
 
   // fold (addo x, 0) -> x + no carry out
@@ -4381,8 +4381,8 @@ SDValue DAGCombiner::visitMULFIX(SDNode *N) {
     return DAG.getConstant(0, SDLoc(N), VT);
 
   // Canonicalize constant to RHS (vector doesn't have to splat)
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
-      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1)))
     return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0, Scale);
 
   // fold (mulfix x, 0, scale) -> 0
@@ -4410,8 +4410,8 @@ template SDValue DAGCombiner::visitMUL(SDNode *N) {
     return C;
 
   // canonicalize constant to RHS (vector doesn't have to splat)
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
-      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1)))
     return Matcher.getNode(ISD::MUL, DL, VT, N1, N0);
 
   bool N1IsConst = false;
@@ -5156,8 +5156,8 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
     return C;
 
   // canonicalize constant to RHS.
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
-      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1)))
     return DAG.getNode(ISD::MULHS, DL, N->getVTList(), N1, N0);
 
   if (VT.isVector()) {
@@ -5215,8 +5215,8 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
     return C;
 
   // canonicalize constant to RHS.
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
-      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1)))
     return DAG.getNode(ISD::MULHU, DL, N->getVTList(), N1, N0);
 
   if (VT.isVector()) {
@@ -5293,8 +5293,8 @@ SDValue DAGCombiner::visitAVG(SDNode *N) {
     return C;
 
   // canonicalize constant to RHS.
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
-      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1)))
    return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
 
   if (VT.isVector())
@@ -5367,8 +5367,8 @@ SDValue DAGCombiner::visitABD(SDNode *N) {
     return C;
 
   // canonicalize constant to RHS.
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
-      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1)))
     return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
 
   if (VT.isVector())
@@ -5465,8 +5465,8 @@ SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
     return DAG.getNode(ISD::SMUL_LOHI, DL, N->getVTList(), N0, N1);
 
   // canonicalize constant to RHS (vector doesn't have to splat)
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
-      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1)))
     return DAG.getNode(ISD::SMUL_LOHI, DL, N->getVTList(), N1, N0);
 
   // If the type is twice as wide is legal, transform the mulhu to a wider
@@ -5506,8 +5506,8 @@ SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
     return DAG.getNode(ISD::UMUL_LOHI, DL, N->getVTList(), N0, N1);
 
   // canonicalize constant to RHS (vector doesn't have to splat)
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
-      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1)))
     return DAG.getNode(ISD::UMUL_LOHI, DL, N->getVTList(), N1, N0);
 
   // (umul_lohi N0, 0) -> (0, 0)
@@ -5570,8 +5570,8 @@ SDValue DAGCombiner::visitMULO(SDNode *N) {
   }
 
   // canonicalize constant to RHS.
- if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) && + !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0); // fold (mulo x, 0) -> 0 + no carry out @@ -5784,8 +5784,8 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { return N0; // canonicalize constant to RHS - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) && + !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) return DAG.getNode(Opcode, DL, VT, N1, N0); // fold vector ops @@ -7048,8 +7048,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) { return C; // canonicalize constant to RHS - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) && + !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) return DAG.getNode(ISD::AND, DL, VT, N1, N0); if (areBitwiseNotOfEachother(N0, N1)) @@ -7945,8 +7945,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) { return C; // canonicalize constant to RHS - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) && + !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) return DAG.getNode(ISD::OR, DL, VT, N1, N0); // fold vector ops @@ -9501,8 +9501,8 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { return C; // canonicalize constant to RHS - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N0)) && + !DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) return DAG.getNode(ISD::XOR, DL, VT, N1, N0); // fold vector ops diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll index 90733dfb8465e..44ab33ad67f27 100644 --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -122,7 +122,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { ; CHECK-LABEL: mul_v32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-NEXT: vpand %ymm1, %ymm2, %ymm3 +; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm3 ; CHECK-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm3 ; CHECK-NEXT: vpandn %ymm1, %ymm2, %ymm1 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index 6fd3db3464dec..ee83a79b6dd55 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -2369,8 +2369,8 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm3, %xmm5 @@ -2391,7 +2391,7 @@ define <16 x i8> 
@vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -2432,7 +2432,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2450,7 +2450,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2592,8 +2592,8 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm3, %xmm5 ; SSE41-NEXT: pand %xmm2, %xmm5 @@ -2616,7 +2616,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -2659,7 +2659,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2677,7 +2677,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 
; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2823,8 +2823,8 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm3, %xmm5 @@ -2846,7 +2846,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -2889,7 +2889,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -2908,7 +2908,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -3054,8 +3054,8 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm3, %xmm5 @@ -3077,7 +3077,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, 
%xmm2, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -3120,7 +3120,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3139,7 +3139,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3287,8 +3287,8 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm3, %xmm5 @@ -3311,7 +3311,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 @@ -3356,7 +3356,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -3376,7 +3376,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = 
xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 5a1c4c8a52c82..b4e8f0a230167 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -1914,7 +1914,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 @@ -1922,7 +1922,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -1944,7 +1944,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -1974,14 +1974,14 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 @@ -1999,7 +1999,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2088,7 +2088,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm6 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, 
%xmm7, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm6, %xmm7, %xmm6 @@ -2096,7 +2096,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -2120,7 +2120,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 @@ -2150,14 +2150,14 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 @@ -2176,7 +2176,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 @@ -2266,7 +2266,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 @@ -2274,7 +2274,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm0, %xmm8, %xmm0 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -2297,7 +2297,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX2-NEXT: 
vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2328,14 +2328,14 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 -; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6 -; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 @@ -2354,7 +2354,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2444,7 +2444,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 @@ -2452,7 +2452,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -2475,7 +2475,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2506,14 +2506,14 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; 
XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 @@ -2532,7 +2532,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2623,7 +2623,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 ; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 ; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 @@ -2631,7 +2631,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm5 ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 @@ -2655,7 +2655,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -2687,14 +2687,14 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 -; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 ; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6 -; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 ; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 @@ -2714,7 +2714,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 8289e885618f7..9b08d8baacee1 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -892,13 +892,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 -; CHECK-SKX-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3 +; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62] ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1 ; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 -; CHECK-SKX-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2 +; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0 ; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx) @@ -913,13 +913,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512-NEXT: vpand %ymm3, %ymm4, %ymm5 +; CHECK-AVX512-NEXT: vpand %ymm4, %ymm3, %ymm5 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 ; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1 ; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4) -; CHECK-AVX512-NEXT: vpand %ymm2, %ymm4, %ymm3 +; CHECK-AVX512-NEXT: vpand %ymm4, %ymm2, %ymm3 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 @@ -939,13 +939,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 -; CHECK-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3 +; CHECK-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62] ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1 ; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 -; CHECK-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2 +; CHECK-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0 ; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx) @@ -967,7 +967,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" 
; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 -; CHECK-SKX-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1 +; CHECK-SKX-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126] ; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1 @@ -980,7 +980,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512-NEXT: vpandq %zmm1, %zmm2, %zmm3 +; CHECK-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm3 ; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 ; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1 ; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 @@ -997,7 +997,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3 ; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 -; CHECK-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1 +; CHECK-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126] ; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1 diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 6c3d04863118c..fe8a4fa163129 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -161,8 +161,8 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind { ; SSE41-LABEL: mul_v16i8: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: pmaddubsw %xmm3, %xmm4 ; SSE41-NEXT: pand %xmm2, %xmm4 @@ -586,17 +586,16 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pand %xmm2, %xmm5 +; SSE41-NEXT: pandn %xmm2, %xmm5 +; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pmaddubsw %xmm5, %xmm6 +; SSE41-NEXT: pmaddubsw %xmm2, %xmm6 ; SSE41-NEXT: pand %xmm4, %xmm6 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pandn %xmm2, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm5, %xmm0 ; SSE41-NEXT: psllw $8, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: pand %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm3, 
%xmm2 +; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm2, %xmm5 ; SSE41-NEXT: pand %xmm4, %xmm5 @@ -609,7 +608,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; AVX2-LABEL: mul_v32i8: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm3 ; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 @@ -621,7 +620,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; AVX512F-LABEL: mul_v32i8: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm3 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3 ; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 @@ -902,37 +901,34 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pand %xmm4, %xmm9 +; SSE41-NEXT: pandn %xmm4, %xmm9 +; SSE41-NEXT: pand %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm10 -; SSE41-NEXT: pmaddubsw %xmm9, %xmm10 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm10 ; SSE41-NEXT: pand %xmm8, %xmm10 -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pandn %xmm4, %xmm9 ; SSE41-NEXT: pmaddubsw %xmm9, %xmm0 ; SSE41-NEXT: psllw $8, %xmm0 ; SSE41-NEXT: por %xmm10, %xmm0 ; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: pandn %xmm5, %xmm4 +; SSE41-NEXT: pand %xmm8, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm9 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm9 +; SSE41-NEXT: pmaddubsw %xmm5, %xmm9 ; SSE41-NEXT: pand %xmm8, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pandn %xmm5, %xmm4 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm1 ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: por %xmm9, %xmm1 ; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pandn %xmm6, %xmm4 +; SSE41-NEXT: pand %xmm8, %xmm6 ; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm6, %xmm5 ; SSE41-NEXT: pand %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pandn %xmm6, %xmm4 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm2 ; SSE41-NEXT: psllw $8, %xmm2 ; SSE41-NEXT: por %xmm5, %xmm2 -; SSE41-NEXT: movdqa %xmm8, %xmm4 -; SSE41-NEXT: pand %xmm7, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm4 +; SSE41-NEXT: pand %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm5 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 ; SSE41-NEXT: pand %xmm8, %xmm5 @@ -945,14 +941,14 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; AVX2-LABEL: mul_v64i8: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm5 +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm5 ; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 ; AVX2-NEXT: vpand %ymm4, %ymm5, %ymm5 ; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm2, 
%ymm2 ; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3 @@ -963,28 +959,28 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; ; AVX512F-LABEL: mul_v64i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 ; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm6 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm6 ; AVX512F-NEXT: vpmaddubsw %ymm6, %ymm0, %ymm6 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512F-NEXT: vpandn %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm1 +; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm1 ; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm5, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm3) +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm2) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_v64i8: ; AVX512BW: # %bb.0: # %entry ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll index c9bb3de92dcda..885b07585e68f 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll @@ -59,7 +59,7 @@ define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX256BW-LABEL: test_mul_32i8: ; AVX256BW: # %bb.0: ; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX256BW-NEXT: vpand %ymm1, %ymm2, %ymm3 +; AVX256BW-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX256BW-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; AVX256BW-NEXT: vpandn %ymm1, %ymm2, %ymm1 ; AVX256BW-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll index efe34c52b3710..d3e4906450e43 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll @@ -84,8 +84,8 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-NEXT: pshufb %xmm8, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pmaddubsw %xmm3, %xmm4 ; SSE-NEXT: pand %xmm2, %xmm4 @@ -120,7 +120,7 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm2 ; AVX1-NEXT: 
vpor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1
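
Reviewer note (not part of the patch): every DAGCombiner.cpp hunk makes the same tweak. When deciding whether to commute a commutative node so that a constant operand lands on the RHS, the combiner now calls `peekThroughBitcasts` before asking `isConstantIntBuildVectorOrConstantInt`, so a constant BUILD_VECTOR that is only visible through a bitcast still counts as a constant. A minimal sketch of that pattern is below; it uses the real `llvm::peekThroughBitcasts` and `SelectionDAG::isConstantIntBuildVectorOrConstantInt` helpers, but the wrapper and function names are invented for illustration and do not appear in the patch.

```cpp
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

// Illustrative only: treat an operand as "constant" for the canonicalization
// even when the constant BUILD_VECTOR is hidden behind a bitcast, e.g. a
// v16i8 AND mask materialized as a bitcast of a wider splat of 255.
static bool isConstantLikeThroughBitcasts(SelectionDAG &DAG, SDValue V) {
  return DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(V));
}

// Commute a commutative node so any (possibly bitcast-hidden) constant ends
// up as operand 1, where later folds expect to find it.
static SDValue canonicalizeConstantToRHS(SelectionDAG &DAG, SDNode *N) {
  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
  if (isConstantLikeThroughBitcasts(DAG, N0) &&
      !isConstantLikeThroughBitcasts(DAG, N1))
    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), N1, N0);
  return SDValue();
}
```

The X86 test churn is the expected fallout: the 255 splat masks in these tests are bitcast constant vectors, so once they are recognized as constants the commutative AND nodes have their operands swapped, and the `vpand`/`pand`/`vpandq` register order flips with no functional change; a few SSE41 sequences are also rescheduled around the swapped operands.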