
Commit 3c89e17

RKSimon authored and memfrob committed
[X86] decomposeMulByConstant - decompose legal vXi32 multiplies on SlowPMULLD targets and all vXi64 multiplies
X86's decomposeMulByConstant never permits mul decomposition to shift+add/sub if the vector multiply is legal.

Unfortunately this isn't great for SSE41+ targets, which have PMULLD for vXi32 multiplies but where it is often quite slow. This patch proposes to allow decomposition if the target has the SlowPMULLD flag (i.e. Silvermont).

We also always decompose legal vXi64 multiplies - even the latest IceLake has really poor latencies for PMULLQ.

Differential Revision: https://reviews.llvm.org/D110588
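For illustration only, here are scalar C++ equivalents of the shl+add, shl+sub and shl+add+neg patterns the decomposition produces for the constants exercised in the updated test (17, 7, -1025); this is a sketch of the arithmetic, not the DAG combine itself, and the function names are made up:

// Sketch: scalar equivalents of the shift+add/sub decompositions.
#include <cstdint>

uint64_t mul17(uint64_t x)      { return (x << 4) + x; }        // shl+add
uint64_t mul7(uint64_t x)       { return (x << 3) - x; }        // shl+sub
uint64_t mulNeg1025(uint64_t x) { return 0 - ((x << 10) + x); } // shl+add+neg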
1 parent 4032728 commit 3c89e17

File tree

2 files changed (+138 additions, -104 deletions)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 7 additions & 4 deletions
@@ -5533,10 +5533,13 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
     VT = getTypeToTransformTo(Context, VT);
 
   // If vector multiply is legal, assume that's faster than shl + add/sub.
-  // TODO: Multiply is a complex op with higher latency and lower throughput in
-  // most implementations, so this check could be loosened based on type
-  // and/or a CPU attribute.
-  if (isOperationLegal(ISD::MUL, VT))
+  // Multiply is a complex op with higher latency and lower throughput in
+  // most implementations, sub-vXi32 vector multiplies are always fast,
+  // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
+  // is always going to be slow.
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
+      (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
     return false;
 
   // shl+add, shl+sub, shl+add+neg
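To paraphrase the new bailout condition (a hedged restatement only, with a hypothetical helper name; isOperationLegal and Subtarget.isPMULLDSlow() are the queries used in the hunk above): the multiply is kept only when it is legal, its elements are at most 32 bits wide, and a 32-bit element multiply is not flagged as slow PMULLD on the subtarget.

// Hypothetical standalone restatement of the condition above; not LLVM API.
static bool keepLegalVectorMultiply(bool IsMulLegal, unsigned EltSizeInBits,
                                    bool IsPMULLDSlow) {
  // Keep MUL for legal sub-vXi32 types and for vXi32 when PMULLD is fast;
  // everything else (SlowPMULLD vXi32, any vXi64) gets decomposed.
  return IsMulLegal && EltSizeInBits <= 32 &&
         (EltSizeInBits != 32 || !IsPMULLDSlow);
}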

llvm/test/CodeGen/X86/vector-mul.ll

Lines changed: 131 additions & 100 deletions
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X86-SSE,X86-SSE2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X86-SSE,X86-SSE4
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X64-SSE,X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,-slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,-slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4,X64-SSE4-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4,X64-SSE4-SLOW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=X64-AVX,X64-XOP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64-AVX,X64-AVX512DQ
@@ -264,22 +264,11 @@ define <2 x i64> @mul_v2i64_17(<2 x i64> %a0) nounwind {
 ; SSE-NEXT: movdqa %xmm1, %xmm0
 ; SSE-NEXT: ret{{[l|q]}}
 ;
-; X64-XOP-LABEL: mul_v2i64_17:
-; X64-XOP: # %bb.0:
-; X64-XOP-NEXT: vpsllq $4, %xmm0, %xmm1
-; X64-XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; X64-XOP-NEXT: retq
-;
-; X64-AVX2-LABEL: mul_v2i64_17:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpsllq $4, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: mul_v2i64_17:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX-LABEL: mul_v2i64_17:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsllq $4, %xmm0, %xmm1
+; X64-AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: retq
 %1 = mul <2 x i64> %a0, <i64 17, i64 17>
 ret <2 x i64> %1
 }
@@ -298,10 +287,18 @@ define <4 x i32> @mul_v4i32_17(<4 x i32> %a0) nounwind {
 ; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE4-NEXT: retl
 ;
-; X64-SSE4-LABEL: mul_v4i32_17:
-; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-NEXT: retq
+; X64-SSE4-FAST-LABEL: mul_v4i32_17:
+; X64-SSE4-FAST: # %bb.0:
+; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-FAST-NEXT: retq
+;
+; X64-SSE4-SLOW-LABEL: mul_v4i32_17:
+; X64-SSE4-SLOW: # %bb.0:
+; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: pslld $4, %xmm1
+; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE4-SLOW-NEXT: retq
 ;
 ; X64-XOP-LABEL: mul_v4i32_17:
 ; X64-XOP: # %bb.0:
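The FAST/SLOW split above amounts to the following two lowerings for a v4i32 multiply by 17, sketched here with standard SSE intrinsics (illustrative C++, not taken from the test or the patch; the function names are made up):

#include <smmintrin.h> // SSE4.1

// X64-SSE4-FAST: a single PMULLD against a splatted constant.
__m128i mul17_pmulld(__m128i x) {
  return _mm_mullo_epi32(x, _mm_set1_epi32(17));
}

// X64-SSE4-SLOW (slow-pmulld targets such as Silvermont): shift+add instead.
__m128i mul17_shift_add(__m128i x) {
  return _mm_add_epi32(_mm_slli_epi32(x, 4), x);
}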
@@ -414,7 +411,8 @@ define <4 x i64> @mul_v4i64_17(<4 x i64> %a0) nounwind {
 ;
 ; X64-AVX512DQ-LABEL: mul_v4i64_17:
 ; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vpsllq $4, %ymm0, %ymm1
+; X64-AVX512DQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0
 ; X64-AVX512DQ-NEXT: retq
 %1 = mul <4 x i64> %a0, <i64 17, i64 17, i64 17, i64 17>
 ret <4 x i64> %1
@@ -433,12 +431,31 @@ define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
 ; SSE2-NEXT: movdqa %xmm3, %xmm1
 ; SSE2-NEXT: ret{{[l|q]}}
 ;
-; SSE4-LABEL: mul_v8i32_17:
-; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17]
-; SSE4-NEXT: pmulld %xmm2, %xmm0
-; SSE4-NEXT: pmulld %xmm2, %xmm1
-; SSE4-NEXT: ret{{[l|q]}}
+; X86-SSE4-LABEL: mul_v8i32_17:
+; X86-SSE4: # %bb.0:
+; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X86-SSE4-NEXT: pmulld %xmm2, %xmm0
+; X86-SSE4-NEXT: pmulld %xmm2, %xmm1
+; X86-SSE4-NEXT: retl
+;
+; X64-SSE4-FAST-LABEL: mul_v8i32_17:
+; X64-SSE4-FAST: # %bb.0:
+; X64-SSE4-FAST-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm0
+; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm1
+; X64-SSE4-FAST-NEXT: retq
+;
+; X64-SSE4-SLOW-LABEL: mul_v8i32_17:
+; X64-SSE4-SLOW: # %bb.0:
+; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE4-SLOW-NEXT: pslld $4, %xmm2
+; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm2
+; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE4-SLOW-NEXT: pslld $4, %xmm3
+; X64-SSE4-SLOW-NEXT: paddd %xmm1, %xmm3
+; X64-SSE4-SLOW-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE4-SLOW-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE4-SLOW-NEXT: retq
 ;
 ; X64-XOP-LABEL: mul_v8i32_17:
 ; X64-XOP: # %bb.0:
@@ -553,26 +570,13 @@ define <2 x i64> @mul_v2i64_neg1025(<2 x i64> %a0) nounwind {
 ; SSE-NEXT: psubq %xmm1, %xmm0
 ; SSE-NEXT: ret{{[l|q]}}
 ;
-; X64-XOP-LABEL: mul_v2i64_neg1025:
-; X64-XOP: # %bb.0:
-; X64-XOP-NEXT: vpsllq $10, %xmm0, %xmm1
-; X64-XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; X64-XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; X64-XOP-NEXT: retq
-;
-; X64-AVX2-LABEL: mul_v2i64_neg1025:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpsllq $10, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: mul_v2i64_neg1025:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX-LABEL: mul_v2i64_neg1025:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsllq $10, %xmm0, %xmm1
+; X64-AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: retq
 %1 = mul <2 x i64> %a0, <i64 -1025, i64 -1025>
 ret <2 x i64> %1
 }
@@ -592,10 +596,19 @@ define <4 x i32> @mul_v4i32_neg33(<4 x i32> %a0) nounwind {
 ; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE4-NEXT: retl
 ;
-; X64-SSE4-LABEL: mul_v4i32_neg33:
-; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-NEXT: retq
+; X64-SSE4-FAST-LABEL: mul_v4i32_neg33:
+; X64-SSE4-FAST: # %bb.0:
+; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-FAST-NEXT: retq
+;
+; X64-SSE4-SLOW-LABEL: mul_v4i32_neg33:
+; X64-SSE4-SLOW: # %bb.0:
+; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: pslld $5, %xmm1
+; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: pxor %xmm0, %xmm0
+; X64-SSE4-SLOW-NEXT: psubd %xmm1, %xmm0
+; X64-SSE4-SLOW-NEXT: retq
 ;
 ; X64-XOP-LABEL: mul_v4i32_neg33:
 ; X64-XOP: # %bb.0:
@@ -724,7 +737,10 @@ define <4 x i64> @mul_v4i64_neg1025(<4 x i64> %a0) nounwind {
 ;
 ; X64-AVX512DQ-LABEL: mul_v4i64_neg1025:
 ; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vpsllq $10, %ymm0, %ymm1
+; X64-AVX512DQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX512DQ-NEXT: vpsubq %ymm0, %ymm1, %ymm0
 ; X64-AVX512DQ-NEXT: retq
 %1 = mul <4 x i64> %a0, <i64 -1025, i64 -1025, i64 -1025, i64 -1025>
 ret <4 x i64> %1
@@ -746,12 +762,34 @@ define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
 ; SSE2-NEXT: movdqa %xmm2, %xmm1
 ; SSE2-NEXT: ret{{[l|q]}}
 ;
-; SSE4-LABEL: mul_v8i32_neg33:
-; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
-; SSE4-NEXT: pmulld %xmm2, %xmm0
-; SSE4-NEXT: pmulld %xmm2, %xmm1
-; SSE4-NEXT: ret{{[l|q]}}
+; X86-SSE4-LABEL: mul_v8i32_neg33:
+; X86-SSE4: # %bb.0:
+; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X86-SSE4-NEXT: pmulld %xmm2, %xmm0
+; X86-SSE4-NEXT: pmulld %xmm2, %xmm1
+; X86-SSE4-NEXT: retl
+;
+; X64-SSE4-FAST-LABEL: mul_v8i32_neg33:
+; X64-SSE4-FAST: # %bb.0:
+; X64-SSE4-FAST-NEXT: movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm0
+; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm1
+; X64-SSE4-FAST-NEXT: retq
+;
+; X64-SSE4-SLOW-LABEL: mul_v8i32_neg33:
+; X64-SSE4-SLOW: # %bb.0:
+; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE4-SLOW-NEXT: pslld $5, %xmm3
+; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm3
+; X64-SSE4-SLOW-NEXT: pxor %xmm2, %xmm2
+; X64-SSE4-SLOW-NEXT: pxor %xmm0, %xmm0
+; X64-SSE4-SLOW-NEXT: psubd %xmm3, %xmm0
+; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE4-SLOW-NEXT: pslld $5, %xmm3
+; X64-SSE4-SLOW-NEXT: paddd %xmm1, %xmm3
+; X64-SSE4-SLOW-NEXT: psubd %xmm3, %xmm2
+; X64-SSE4-SLOW-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE4-SLOW-NEXT: retq
 ;
 ; X64-XOP-LABEL: mul_v8i32_neg33:
 ; X64-XOP: # %bb.0:
@@ -1070,22 +1108,11 @@ define <2 x i64> @mul_v2i64_7(<2 x i64> %a0) nounwind {
 ; SSE-NEXT: movdqa %xmm1, %xmm0
 ; SSE-NEXT: ret{{[l|q]}}
 ;
-; X64-XOP-LABEL: mul_v2i64_7:
-; X64-XOP: # %bb.0:
-; X64-XOP-NEXT: vpsllq $3, %xmm0, %xmm1
-; X64-XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; X64-XOP-NEXT: retq
-;
-; X64-AVX2-LABEL: mul_v2i64_7:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpsllq $3, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: mul_v2i64_7:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX-LABEL: mul_v2i64_7:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsllq $3, %xmm0, %xmm1
+; X64-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: retq
 %1 = mul <2 x i64> %a0, <i64 7, i64 7>
 ret <2 x i64> %1
 }
@@ -1104,10 +1131,18 @@ define <4 x i32> @mul_v4i32_7(<4 x i32> %a0) nounwind {
 ; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE4-NEXT: retl
 ;
-; X64-SSE4-LABEL: mul_v4i32_7:
-; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-NEXT: retq
+; X64-SSE4-FAST-LABEL: mul_v4i32_7:
+; X64-SSE4-FAST: # %bb.0:
+; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-FAST-NEXT: retq
+;
+; X64-SSE4-SLOW-LABEL: mul_v4i32_7:
+; X64-SSE4-SLOW: # %bb.0:
+; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: pslld $3, %xmm1
+; X64-SSE4-SLOW-NEXT: psubd %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE4-SLOW-NEXT: retq
 ;
 ; X64-XOP-LABEL: mul_v4i32_7:
 ; X64-XOP: # %bb.0:
@@ -1201,22 +1236,11 @@ define <2 x i64> @mul_v2i64_neg7(<2 x i64> %a0) nounwind {
 ; SSE-NEXT: psubq %xmm1, %xmm0
 ; SSE-NEXT: ret{{[l|q]}}
 ;
-; X64-XOP-LABEL: mul_v2i64_neg7:
-; X64-XOP: # %bb.0:
-; X64-XOP-NEXT: vpsllq $3, %xmm0, %xmm1
-; X64-XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; X64-XOP-NEXT: retq
-;
-; X64-AVX2-LABEL: mul_v2i64_neg7:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpsllq $3, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: mul_v2i64_neg7:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX-LABEL: mul_v2i64_neg7:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsllq $3, %xmm0, %xmm1
+; X64-AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
 %1 = mul <2 x i64> %a0, <i64 -7, i64 -7>
 ret <2 x i64> %1
 }
@@ -1234,10 +1258,17 @@ define <4 x i32> @mul_v4i32_neg63(<4 x i32> %a0) nounwind {
 ; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE4-NEXT: retl
 ;
-; X64-SSE4-LABEL: mul_v4i32_neg63:
-; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-NEXT: retq
+; X64-SSE4-FAST-LABEL: mul_v4i32_neg63:
+; X64-SSE4-FAST: # %bb.0:
+; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-FAST-NEXT: retq
+;
+; X64-SSE4-SLOW-LABEL: mul_v4i32_neg63:
+; X64-SSE4-SLOW: # %bb.0:
+; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: pslld $6, %xmm1
+; X64-SSE4-SLOW-NEXT: psubd %xmm1, %xmm0
+; X64-SSE4-SLOW-NEXT: retq
 ;
 ; X64-XOP-LABEL: mul_v4i32_neg63:
 ; X64-XOP: # %bb.0:
