
Commit 6f51df3

RKSimon authored and mahesh-attarde committed
[X86] LowerShiftByScalarImmediate - move shl(x,1) -> add(freeze(x),freeze(x)) to X86FixupInstTunings (llvm#161007)
Avoid the shl(x,1) -> add(freeze(x),freeze(x)) if the shift-imm is legal, and leave it to X86FixupInstTunings. Helps avoid missed optimisations due to oneuse limits, avoids unnecessary freezes and allows AVX512 to fold to mi memory folding variants. Fixes llvm#161006
1 parent 9c31154 commit 6f51df3

22 files changed: 258 additions, 227 deletions
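
As a rough sketch of the trade-off (illustrative AT&T-syntax assembly, not lines taken from the patch): the register form of a shift-by-one can be retuned to an add, but only the shift form can fold a load operand, so lowering now keeps the shift and leaves the choice to the fixup pass:

    vpslld $1, %zmm0, %zmm0      # shl-by-1 in a register ...
    vpaddd %zmm0, %zmm0, %zmm0   # ... may be retuned to add(x,x)
    vpslld $1, (%rsi), %zmm0     # AVX512 mi form folds the load; an add
                                 # would need the loaded value twice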

llvm/lib/Target/X86/X86FixupInstTuning.cpp

Lines changed: 54 additions & 0 deletions
@@ -277,6 +277,22 @@ bool X86FixupInstTuningPass::processInstruction(
     return true;
   };
 
+  // Is ADD(X,X) more efficient than SHL(X,1)?
+  auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool {
+    if (MI.getOperand(NumOperands - 1).getImm() != 1)
+      return false;
+    if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true))
+      return false;
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(AddOpc));
+      MI.removeOperand(NumOperands - 1);
+      MI.addOperand(MI.getOperand(NumOperands - 2));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
+    return false;
+  };
+
   switch (Opc) {
   case X86::BLENDPDrri:
     return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
@@ -563,6 +579,44 @@ bool X86FixupInstTuningPass::processInstruction(
     return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz);
   case X86::VUNPCKHPSZrmkz:
     return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz);
+
+  case X86::PSLLWri:
+    return ProcessShiftLeftToAdd(X86::PADDWrr);
+  case X86::VPSLLWri:
+    return ProcessShiftLeftToAdd(X86::VPADDWrr);
+  case X86::VPSLLWYri:
+    return ProcessShiftLeftToAdd(X86::VPADDWYrr);
+  case X86::VPSLLWZ128ri:
+    return ProcessShiftLeftToAdd(X86::VPADDWZ128rr);
+  case X86::VPSLLWZ256ri:
+    return ProcessShiftLeftToAdd(X86::VPADDWZ256rr);
+  case X86::VPSLLWZri:
+    return ProcessShiftLeftToAdd(X86::VPADDWZrr);
+  case X86::PSLLDri:
+    return ProcessShiftLeftToAdd(X86::PADDDrr);
+  case X86::VPSLLDri:
+    return ProcessShiftLeftToAdd(X86::VPADDDrr);
+  case X86::VPSLLDYri:
+    return ProcessShiftLeftToAdd(X86::VPADDDYrr);
+  case X86::VPSLLDZ128ri:
+    return ProcessShiftLeftToAdd(X86::VPADDDZ128rr);
+  case X86::VPSLLDZ256ri:
+    return ProcessShiftLeftToAdd(X86::VPADDDZ256rr);
+  case X86::VPSLLDZri:
+    return ProcessShiftLeftToAdd(X86::VPADDDZrr);
+  case X86::PSLLQri:
+    return ProcessShiftLeftToAdd(X86::PADDQrr);
+  case X86::VPSLLQri:
+    return ProcessShiftLeftToAdd(X86::VPADDQrr);
+  case X86::VPSLLQYri:
+    return ProcessShiftLeftToAdd(X86::VPADDQYrr);
+  case X86::VPSLLQZ128ri:
+    return ProcessShiftLeftToAdd(X86::VPADDQZ128rr);
+  case X86::VPSLLQZ256ri:
+    return ProcessShiftLeftToAdd(X86::VPADDQZ256rr);
+  case X86::VPSLLQZri:
+    return ProcessShiftLeftToAdd(X86::VPADDQZrr);
+
   default:
     return false;
   }
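
For illustration, each shift-by-one opcode above maps to the same-width add with the source operand repeated; a hypothetical SSE instance of the rewrite (the VEX/EVEX cases follow the same pattern):

    psllw $1, %xmm0       # PSLLWri with immediate 1 ...
    paddw %xmm0, %xmm0    # ... becomes PADDWrr reading %xmm0 twice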

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 1 addition & 15 deletions
@@ -30313,22 +30313,8 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
 
   uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
 
-  if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
-    // Hardware support for vector shifts is sparse which makes us scalarize the
-    // vector operations in many cases. Also, on sandybridge ADD is faster than
-    // shl: (shl V, 1) -> (add (freeze V), (freeze V))
-    if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
-      // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
-      // must be 0). (add undef, undef) however can be any value. To make this
-      // safe, we must freeze R to ensure that register allocation uses the same
-      // register for an undefined value. This ensures that the result will
-      // still be even and preserves the original semantics.
-      R = DAG.getFreeze(R);
-      return DAG.getNode(ISD::ADD, dl, VT, R, R);
-    }
-
+  if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
     return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
-  }
 
   // i64 SRA needs to be performed as partial shifts.
   if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
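
Note why no freeze is needed once the rewrite happens after instruction selection (illustrative assembly, assuming the register form): both add operands name the same register, so even an undefined value is read as the same bits twice and every lane's low bit stays zero, preserving the shl semantics:

    pslld $1, %xmm0       # each lane of the result is even (LSB = 0)
    paddd %xmm0, %xmm0    # same register read twice -> still even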

llvm/test/CodeGen/X86/combine-add.ll

Lines changed: 2 additions & 2 deletions
@@ -235,10 +235,10 @@ define void @PR52039(ptr %pa, ptr %pb) {
 ; SSE-NEXT:    psubd %xmm1, %xmm3
 ; SSE-NEXT:    psubd %xmm0, %xmm2
 ; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    paddd %xmm0, %xmm0
 ; SSE-NEXT:    paddd %xmm2, %xmm0
 ; SSE-NEXT:    movdqa %xmm3, %xmm1
-; SSE-NEXT:    paddd %xmm3, %xmm1
+; SSE-NEXT:    paddd %xmm1, %xmm1
 ; SSE-NEXT:    paddd %xmm3, %xmm1
 ; SSE-NEXT:    movdqu %xmm3, 16(%rsi)
 ; SSE-NEXT:    movdqu %xmm2, (%rsi)

llvm/test/CodeGen/X86/combine-mul.ll

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
 ; SSE-LABEL: combine_vec_mul_pow2c:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    paddq %xmm0, %xmm2
+; SSE-NEXT:    paddq %xmm2, %xmm2
 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
 ; SSE-NEXT:    psllq $4, %xmm2

llvm/test/CodeGen/X86/combine-sdiv.ll

Lines changed: 14 additions & 16 deletions
@@ -2187,29 +2187,28 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    pcmpgtb %xmm1, %xmm3
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
 ; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128]
 ; SSE41-NEXT:    psrlw $8, %xmm3
-; SSE41-NEXT:    paddw %xmm4, %xmm4
-; SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5],xmm4[6],xmm2[7]
+; SSE41-NEXT:    pmovsxbw %xmm1, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    paddw %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2],xmm0[3,4,5],xmm2[6],xmm0[7]
 ; SSE41-NEXT:    psrlw $8, %xmm2
 ; SSE41-NEXT:    packuswb %xmm3, %xmm2
 ; SSE41-NEXT:    paddb %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE41-NEXT:    psraw $8, %xmm0
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    paddw %xmm0, %xmm3
-; SSE41-NEXT:    psllw $7, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
-; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    psllw $7, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7]
+; SSE41-NEXT:    psrlw $8, %xmm3
 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE41-NEXT:    psraw $8, %xmm2
 ; SSE41-NEXT:    psllw $7, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    packuswb %xmm0, %xmm2
+; SSE41-NEXT:    packuswb %xmm3, %xmm2
 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
 ; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
@@ -2225,18 +2224,17 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,2,2,2,2,128,2,128]
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm3
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5],xmm2[6],xmm3[7]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5],xmm3[6],xmm2[7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
-; AVX1-NEXT:    vpsllw $7, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
+; AVX1-NEXT:    vpsllw $7, %xmm2, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1

llvm/test/CodeGen/X86/known-signbits-shl.ll

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ define void @computeNumSignBits_shl_zext_vec_3(<2 x i8> %x, ptr %p) nounwind {
 ; X64-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-NEXT:    por %xmm2, %xmm1
 ; X64-NEXT:    movdqa %xmm0, %xmm2
-; X64-NEXT:    paddw %xmm0, %xmm2
+; X64-NEXT:    paddw %xmm2, %xmm2
 ; X64-NEXT:    movdqa %xmm2, %xmm3
 ; X64-NEXT:    psraw $1, %xmm3
 ; X64-NEXT:    pcmpeqw %xmm0, %xmm3

llvm/test/CodeGen/X86/masked_gather_scatter.ll

Lines changed: 12 additions & 21 deletions
@@ -4806,9 +4806,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
 ; X64-KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; X64-KNL-NEXT:    vpslld $1, (%rsi), %zmm0
 ; X64-KNL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT:    vpaddd %zmm0, %zmm0, %zmm0
 ; X64-KNL-NEXT:    vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-KNL-NEXT:    retq
@@ -4830,9 +4829,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
 ; X64-SKX-SMALL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT:    vpslld $1, (%rsi), %zmm0
 ; X64-SKX-SMALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT:    vpaddd %zmm0, %zmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-SKX-SMALL-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-SKX-SMALL-NEXT:    retq
@@ -4842,10 +4840,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
 ; X64-SKX-LARGE-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT:    vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT:    vpslld $1, (%rsi), %zmm0
 ; X64-SKX-LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
 ; X64-SKX-LARGE-NEXT:    vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT:    vpaddd %zmm0, %zmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-SKX-LARGE-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-SKX-LARGE-NEXT:    retq
@@ -4875,9 +4872,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
 ; X64-KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; X64-KNL-NEXT:    vpslld $1, (%rsi), %zmm0
 ; X64-KNL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT:    vpaddd %zmm0, %zmm0, %zmm0
 ; X64-KNL-NEXT:    vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-KNL-NEXT:    retq
@@ -4899,9 +4895,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
 ; X64-SKX-SMALL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT:    vpslld $1, (%rsi), %zmm0
 ; X64-SKX-SMALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT:    vpaddd %zmm0, %zmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-SKX-SMALL-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-SKX-SMALL-NEXT:    retq
@@ -4911,10 +4906,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
 ; X64-SKX-LARGE-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT:    vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT:    vpslld $1, (%rsi), %zmm0
 ; X64-SKX-LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
 ; X64-SKX-LARGE-NEXT:    vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT:    vpaddd %zmm0, %zmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-SKX-LARGE-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-SKX-LARGE-NEXT:    retq
@@ -4944,9 +4938,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
 ; X64-KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT:    vmovdqu64 (%rsi), %zmm0
-; X64-KNL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT:    vpaddd %zmm0, %zmm0, %zmm2
+; X64-KNL-NEXT:    vpslld $1, (%rsi), %zmm0
+; X64-KNL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
 ; X64-KNL-NEXT:    kmovw %k1, %k2
 ; X64-KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-KNL-NEXT:    vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
@@ -4972,9 +4965,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
 ; X64-SKX-SMALL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-SKX-SMALL-NEXT:    vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT:    vmovdqu64 (%rsi), %zmm0
-; X64-SKX-SMALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT:    vpaddd %zmm0, %zmm0, %zmm2
+; X64-SKX-SMALL-NEXT:    vpslld $1, (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
 ; X64-SKX-SMALL-NEXT:    kmovw %k1, %k2
 ; X64-SKX-SMALL-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-SKX-SMALL-NEXT:    vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
@@ -4986,10 +4978,9 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
 ; X64-SKX-LARGE-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X64-SKX-LARGE-NEXT:    vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT:    vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT:    vpslld $1, (%rsi), %zmm0
 ; X64-SKX-LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT:    vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT:    vpaddd %zmm0, %zmm0, %zmm2
+; X64-SKX-LARGE-NEXT:    vpandd (%rax){1to16}, %zmm0, %zmm2
 ; X64-SKX-LARGE-NEXT:    kmovw %k1, %k2
 ; X64-SKX-LARGE-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-SKX-LARGE-NEXT:    vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}

llvm/test/CodeGen/X86/oddsubvector.ll

Lines changed: 6 additions & 6 deletions
@@ -155,18 +155,18 @@ define <16 x i32> @PR42819(ptr %a0) {
 define void @PR42833() {
 ; SSE2-LABEL: PR42833:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movl b(%rip), %eax
 ; SSE2-NEXT:    movdqa c+144(%rip), %xmm2
 ; SSE2-NEXT:    movdqa c+128(%rip), %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    addl b(%rip), %eax
+; SSE2-NEXT:    addl c+128(%rip), %eax
 ; SSE2-NEXT:    movd %eax, %xmm1
 ; SSE2-NEXT:    movd %eax, %xmm3
 ; SSE2-NEXT:    paddd %xmm0, %xmm3
 ; SSE2-NEXT:    movdqa d+144(%rip), %xmm4
 ; SSE2-NEXT:    psubd %xmm2, %xmm4
 ; SSE2-NEXT:    paddd %xmm2, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    paddd %xmm0, %xmm5
+; SSE2-NEXT:    paddd %xmm5, %xmm5
 ; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
 ; SSE2-NEXT:    movdqa %xmm2, c+144(%rip)
 ; SSE2-NEXT:    movaps %xmm5, c+128(%rip)
@@ -191,17 +191,17 @@ define void @PR42833() {
 ;
 ; SSE42-LABEL: PR42833:
 ; SSE42:       # %bb.0:
+; SSE42-NEXT:    movl b(%rip), %eax
 ; SSE42-NEXT:    movdqa c+144(%rip), %xmm1
 ; SSE42-NEXT:    movdqa c+128(%rip), %xmm0
-; SSE42-NEXT:    movd %xmm0, %eax
-; SSE42-NEXT:    addl b(%rip), %eax
+; SSE42-NEXT:    addl c+128(%rip), %eax
 ; SSE42-NEXT:    movd %eax, %xmm2
 ; SSE42-NEXT:    paddd %xmm0, %xmm2
 ; SSE42-NEXT:    movdqa d+144(%rip), %xmm3
 ; SSE42-NEXT:    psubd %xmm1, %xmm3
 ; SSE42-NEXT:    paddd %xmm1, %xmm1
 ; SSE42-NEXT:    movdqa %xmm0, %xmm4
-; SSE42-NEXT:    paddd %xmm0, %xmm4
+; SSE42-NEXT:    paddd %xmm4, %xmm4
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
 ; SSE42-NEXT:    movdqa %xmm1, c+144(%rip)
 ; SSE42-NEXT:    movdqa %xmm4, c+128(%rip)

llvm/test/CodeGen/X86/pr62286.ll

Lines changed: 22 additions & 16 deletions
@@ -26,27 +26,33 @@ define i64 @PR62286(i32 %a) {
 ; AVX1-LABEL: PR62286:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovd %edi, %xmm0
-; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7]
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: PR62286:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpaddd %xmm0, %xmm0, %xmm1
-; AVX2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm1
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
@@ -59,12 +65,12 @@ define i64 @PR62286(i32 %a) {
 ; AVX512-LABEL: PR62286:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovd %edi, %xmm0
-; AVX512-NEXT:    movb $8, %al
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm1
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    movw $4369, %ax # imm = 0x1111
 ; AVX512-NEXT:    kmovd %eax, %k1
-; AVX512-NEXT:    vpexpandd %ymm0, %ymm1 {%k1} {z}
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
