Skip to content

Commit f84b784

Browse files
authored
[X86] LowerShiftByScalarImmediate - move shl(x,1) -> add(freeze(x),freeze(x)) to X86FixupInstTunings (#161007)
Avoid the shl(x,1) -> add(freeze(x),freeze(x)) fold if the shift-imm is legal, and leave it to X86FixupInstTuning. This helps avoid missed optimisations due to oneuse limits, avoids unnecessary freezes and allows AVX512 to fold to mi memory folding variants. Fixes #161006
1 parent 3a3a4fb commit f84b784

22 files changed

+258
-227
lines changed

llvm/lib/Target/X86/X86FixupInstTuning.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,22 @@ bool X86FixupInstTuningPass::processInstruction(
277277
return true;
278278
};
279279

280+
// Is ADD(X,X) more efficient than SHL(X,1)?
281+
auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool {
282+
if (MI.getOperand(NumOperands - 1).getImm() != 1)
283+
return false;
284+
if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true))
285+
return false;
286+
LLVM_DEBUG(dbgs() << "Replacing: " << MI);
287+
{
288+
MI.setDesc(TII->get(AddOpc));
289+
MI.removeOperand(NumOperands - 1);
290+
MI.addOperand(MI.getOperand(NumOperands - 2));
291+
}
292+
LLVM_DEBUG(dbgs() << " With: " << MI);
293+
return false;
294+
};
295+
280296
switch (Opc) {
281297
case X86::BLENDPDrri:
282298
return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
@@ -563,6 +579,44 @@ bool X86FixupInstTuningPass::processInstruction(
563579
return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz);
564580
case X86::VUNPCKHPSZrmkz:
565581
return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz);
582+
583+
case X86::PSLLWri:
584+
return ProcessShiftLeftToAdd(X86::PADDWrr);
585+
case X86::VPSLLWri:
586+
return ProcessShiftLeftToAdd(X86::VPADDWrr);
587+
case X86::VPSLLWYri:
588+
return ProcessShiftLeftToAdd(X86::VPADDWYrr);
589+
case X86::VPSLLWZ128ri:
590+
return ProcessShiftLeftToAdd(X86::VPADDWZ128rr);
591+
case X86::VPSLLWZ256ri:
592+
return ProcessShiftLeftToAdd(X86::VPADDWZ256rr);
593+
case X86::VPSLLWZri:
594+
return ProcessShiftLeftToAdd(X86::VPADDWZrr);
595+
case X86::PSLLDri:
596+
return ProcessShiftLeftToAdd(X86::PADDDrr);
597+
case X86::VPSLLDri:
598+
return ProcessShiftLeftToAdd(X86::VPADDDrr);
599+
case X86::VPSLLDYri:
600+
return ProcessShiftLeftToAdd(X86::VPADDDYrr);
601+
case X86::VPSLLDZ128ri:
602+
return ProcessShiftLeftToAdd(X86::VPADDDZ128rr);
603+
case X86::VPSLLDZ256ri:
604+
return ProcessShiftLeftToAdd(X86::VPADDDZ256rr);
605+
case X86::VPSLLDZri:
606+
return ProcessShiftLeftToAdd(X86::VPADDDZrr);
607+
case X86::PSLLQri:
608+
return ProcessShiftLeftToAdd(X86::PADDQrr);
609+
case X86::VPSLLQri:
610+
return ProcessShiftLeftToAdd(X86::VPADDQrr);
611+
case X86::VPSLLQYri:
612+
return ProcessShiftLeftToAdd(X86::VPADDQYrr);
613+
case X86::VPSLLQZ128ri:
614+
return ProcessShiftLeftToAdd(X86::VPADDQZ128rr);
615+
case X86::VPSLLQZ256ri:
616+
return ProcessShiftLeftToAdd(X86::VPADDQZ256rr);
617+
case X86::VPSLLQZri:
618+
return ProcessShiftLeftToAdd(X86::VPADDQZrr);
619+
566620
default:
567621
return false;
568622
}

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30313,22 +30313,8 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
3031330313

3031430314
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
3031530315

30316-
if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30317-
// Hardware support for vector shifts is sparse which makes us scalarize the
30318-
// vector operations in many cases. Also, on sandybridge ADD is faster than
30319-
// shl: (shl V, 1) -> (add (freeze V), (freeze V))
30320-
if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30321-
// R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30322-
// must be 0). (add undef, undef) however can be any value. To make this
30323-
// safe, we must freeze R to ensure that register allocation uses the same
30324-
// register for an undefined value. This ensures that the result will
30325-
// still be even and preserves the original semantics.
30326-
R = DAG.getFreeze(R);
30327-
return DAG.getNode(ISD::ADD, dl, VT, R, R);
30328-
}
30329-
30316+
if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
3033030317
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30331-
}
3033230318

3033330319
// i64 SRA needs to be performed as partial shifts.
3033430320
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||

llvm/test/CodeGen/X86/combine-add.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,10 +235,10 @@ define void @PR52039(ptr %pa, ptr %pb) {
235235
; SSE-NEXT: psubd %xmm1, %xmm3
236236
; SSE-NEXT: psubd %xmm0, %xmm2
237237
; SSE-NEXT: movdqa %xmm2, %xmm0
238-
; SSE-NEXT: paddd %xmm2, %xmm0
238+
; SSE-NEXT: paddd %xmm0, %xmm0
239239
; SSE-NEXT: paddd %xmm2, %xmm0
240240
; SSE-NEXT: movdqa %xmm3, %xmm1
241-
; SSE-NEXT: paddd %xmm3, %xmm1
241+
; SSE-NEXT: paddd %xmm1, %xmm1
242242
; SSE-NEXT: paddd %xmm3, %xmm1
243243
; SSE-NEXT: movdqu %xmm3, 16(%rsi)
244244
; SSE-NEXT: movdqu %xmm2, (%rsi)

llvm/test/CodeGen/X86/combine-mul.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
8181
; SSE-LABEL: combine_vec_mul_pow2c:
8282
; SSE: # %bb.0:
8383
; SSE-NEXT: movdqa %xmm0, %xmm2
84-
; SSE-NEXT: paddq %xmm0, %xmm2
84+
; SSE-NEXT: paddq %xmm2, %xmm2
8585
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
8686
; SSE-NEXT: movdqa %xmm1, %xmm2
8787
; SSE-NEXT: psllq $4, %xmm2

llvm/test/CodeGen/X86/combine-sdiv.ll

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2187,29 +2187,28 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
21872187
; SSE41-NEXT: pxor %xmm0, %xmm0
21882188
; SSE41-NEXT: pxor %xmm3, %xmm3
21892189
; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
2190-
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
21912190
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
21922191
; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128]
21932192
; SSE41-NEXT: psrlw $8, %xmm3
2194-
; SSE41-NEXT: paddw %xmm4, %xmm4
2195-
; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
2196-
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5],xmm4[6],xmm2[7]
2193+
; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
2194+
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2195+
; SSE41-NEXT: paddw %xmm2, %xmm2
2196+
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2],xmm0[3,4,5],xmm2[6],xmm0[7]
21972197
; SSE41-NEXT: psrlw $8, %xmm2
21982198
; SSE41-NEXT: packuswb %xmm3, %xmm2
21992199
; SSE41-NEXT: paddb %xmm1, %xmm2
22002200
; SSE41-NEXT: movdqa %xmm2, %xmm0
22012201
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
22022202
; SSE41-NEXT: psraw $8, %xmm0
22032203
; SSE41-NEXT: movdqa %xmm0, %xmm3
2204-
; SSE41-NEXT: paddw %xmm0, %xmm3
2205-
; SSE41-NEXT: psllw $7, %xmm0
2206-
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
2207-
; SSE41-NEXT: psrlw $8, %xmm0
2204+
; SSE41-NEXT: psllw $7, %xmm3
2205+
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7]
2206+
; SSE41-NEXT: psrlw $8, %xmm3
22082207
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
22092208
; SSE41-NEXT: psraw $8, %xmm2
22102209
; SSE41-NEXT: psllw $7, %xmm2
22112210
; SSE41-NEXT: psrlw $8, %xmm2
2212-
; SSE41-NEXT: packuswb %xmm0, %xmm2
2211+
; SSE41-NEXT: packuswb %xmm3, %xmm2
22132212
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
22142213
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
22152214
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
@@ -2225,18 +2224,17 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
22252224
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
22262225
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,2,2,2,2,128,2,128]
22272226
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
2228-
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2229-
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
2230-
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3
2231-
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5],xmm2[6],xmm3[7]
2227+
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2
2228+
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2229+
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
2230+
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5],xmm3[6],xmm2[7]
22322231
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
22332232
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
22342233
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
22352234
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
22362235
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
2237-
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
2238-
; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
2239-
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
2236+
; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3
2237+
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7]
22402238
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
22412239
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
22422240
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1

llvm/test/CodeGen/X86/known-signbits-shl.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ define void @computeNumSignBits_shl_zext_vec_3(<2 x i8> %x, ptr %p) nounwind {
137137
; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
138138
; X64-NEXT: por %xmm2, %xmm1
139139
; X64-NEXT: movdqa %xmm0, %xmm2
140-
; X64-NEXT: paddw %xmm0, %xmm2
140+
; X64-NEXT: paddw %xmm2, %xmm2
141141
; X64-NEXT: movdqa %xmm2, %xmm3
142142
; X64-NEXT: psraw $1, %xmm3
143143
; X64-NEXT: pcmpeqw %xmm0, %xmm3

llvm/test/CodeGen/X86/masked_gather_scatter.ll

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4806,9 +4806,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
48064806
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
48074807
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
48084808
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
4809-
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
4809+
; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0
48104810
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
4811-
; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
48124811
; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
48134812
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
48144813
; X64-KNL-NEXT: retq
@@ -4830,9 +4829,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
48304829
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
48314830
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
48324831
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
4833-
; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
4832+
; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0
48344833
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
4835-
; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
48364834
; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
48374835
; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
48384836
; X64-SKX-SMALL-NEXT: retq
@@ -4842,10 +4840,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
48424840
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
48434841
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
48444842
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
4845-
; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
4843+
; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0
48464844
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
48474845
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
4848-
; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0
48494846
; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
48504847
; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
48514848
; X64-SKX-LARGE-NEXT: retq
@@ -4875,9 +4872,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
48754872
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
48764873
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
48774874
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
4878-
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
4875+
; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0
48794876
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
4880-
; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
48814877
; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
48824878
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
48834879
; X64-KNL-NEXT: retq
@@ -4899,9 +4895,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
48994895
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
49004896
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
49014897
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
4902-
; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
4898+
; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0
49034899
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
4904-
; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
49054900
; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
49064901
; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
49074902
; X64-SKX-SMALL-NEXT: retq
@@ -4911,10 +4906,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
49114906
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
49124907
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
49134908
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
4914-
; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
4909+
; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0
49154910
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
49164911
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
4917-
; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0
49184912
; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
49194913
; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
49204914
; X64-SKX-LARGE-NEXT: retq
@@ -4944,9 +4938,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
49444938
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
49454939
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
49464940
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
4947-
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
4948-
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
4949-
; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
4941+
; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0
4942+
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
49504943
; X64-KNL-NEXT: kmovw %k1, %k2
49514944
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
49524945
; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
@@ -4972,9 +4965,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
49724965
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
49734966
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
49744967
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
4975-
; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
4976-
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
4977-
; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
4968+
; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0
4969+
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
49784970
; X64-SKX-SMALL-NEXT: kmovw %k1, %k2
49794971
; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
49804972
; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
@@ -4986,10 +4978,9 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
49864978
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
49874979
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
49884980
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
4989-
; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
4981+
; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0
49904982
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
4991-
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
4992-
; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm2
4983+
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm2
49934984
; X64-SKX-LARGE-NEXT: kmovw %k1, %k2
49944985
; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
49954986
; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}

llvm/test/CodeGen/X86/oddsubvector.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -155,18 +155,18 @@ define <16 x i32> @PR42819(ptr %a0) {
155155
define void @PR42833() {
156156
; SSE2-LABEL: PR42833:
157157
; SSE2: # %bb.0:
158+
; SSE2-NEXT: movl b(%rip), %eax
158159
; SSE2-NEXT: movdqa c+144(%rip), %xmm2
159160
; SSE2-NEXT: movdqa c+128(%rip), %xmm0
160-
; SSE2-NEXT: movd %xmm0, %eax
161-
; SSE2-NEXT: addl b(%rip), %eax
161+
; SSE2-NEXT: addl c+128(%rip), %eax
162162
; SSE2-NEXT: movd %eax, %xmm1
163163
; SSE2-NEXT: movd %eax, %xmm3
164164
; SSE2-NEXT: paddd %xmm0, %xmm3
165165
; SSE2-NEXT: movdqa d+144(%rip), %xmm4
166166
; SSE2-NEXT: psubd %xmm2, %xmm4
167167
; SSE2-NEXT: paddd %xmm2, %xmm2
168168
; SSE2-NEXT: movdqa %xmm0, %xmm5
169-
; SSE2-NEXT: paddd %xmm0, %xmm5
169+
; SSE2-NEXT: paddd %xmm5, %xmm5
170170
; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
171171
; SSE2-NEXT: movdqa %xmm2, c+144(%rip)
172172
; SSE2-NEXT: movaps %xmm5, c+128(%rip)
@@ -191,17 +191,17 @@ define void @PR42833() {
191191
;
192192
; SSE42-LABEL: PR42833:
193193
; SSE42: # %bb.0:
194+
; SSE42-NEXT: movl b(%rip), %eax
194195
; SSE42-NEXT: movdqa c+144(%rip), %xmm1
195196
; SSE42-NEXT: movdqa c+128(%rip), %xmm0
196-
; SSE42-NEXT: movd %xmm0, %eax
197-
; SSE42-NEXT: addl b(%rip), %eax
197+
; SSE42-NEXT: addl c+128(%rip), %eax
198198
; SSE42-NEXT: movd %eax, %xmm2
199199
; SSE42-NEXT: paddd %xmm0, %xmm2
200200
; SSE42-NEXT: movdqa d+144(%rip), %xmm3
201201
; SSE42-NEXT: psubd %xmm1, %xmm3
202202
; SSE42-NEXT: paddd %xmm1, %xmm1
203203
; SSE42-NEXT: movdqa %xmm0, %xmm4
204-
; SSE42-NEXT: paddd %xmm0, %xmm4
204+
; SSE42-NEXT: paddd %xmm4, %xmm4
205205
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
206206
; SSE42-NEXT: movdqa %xmm1, c+144(%rip)
207207
; SSE42-NEXT: movdqa %xmm4, c+128(%rip)

llvm/test/CodeGen/X86/pr62286.ll

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -26,27 +26,33 @@ define i64 @PR62286(i32 %a) {
2626
; AVX1-LABEL: PR62286:
2727
; AVX1: # %bb.0:
2828
; AVX1-NEXT: vmovd %edi, %xmm0
29-
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
29+
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
30+
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
31+
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7]
3032
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
31-
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
32-
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
33-
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
33+
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
34+
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
35+
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
36+
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
37+
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
38+
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3439
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
35-
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
36-
; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
37-
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
40+
; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
3841
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3942
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
4043
; AVX1-NEXT: vmovq %xmm0, %rax
44+
; AVX1-NEXT: vzeroupper
4145
; AVX1-NEXT: retq
4246
;
4347
; AVX2-LABEL: PR62286:
4448
; AVX2: # %bb.0:
4549
; AVX2-NEXT: vmovd %edi, %xmm0
46-
; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1
47-
; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
48-
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
49-
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
50+
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
51+
; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm1
52+
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
53+
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
54+
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
55+
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
5056
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
5157
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
5258
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
@@ -59,12 +65,12 @@ define i64 @PR62286(i32 %a) {
5965
; AVX512-LABEL: PR62286:
6066
; AVX512: # %bb.0:
6167
; AVX512-NEXT: vmovd %edi, %xmm0
62-
; AVX512-NEXT: movb $8, %al
68+
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
69+
; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm1
70+
; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
71+
; AVX512-NEXT: movw $4369, %ax # imm = 0x1111
6372
; AVX512-NEXT: kmovd %eax, %k1
64-
; AVX512-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
65-
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
66-
; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0
67-
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
73+
; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
6874
; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
6975
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7076
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0

0 commit comments

Comments
 (0)