
Commit 9f773b8

Improve variable 8-bit shifts on AVX512BW
The existing implementation used three shifts by an immediate followed by selects. This commit changes the implementation to use two variable 16-bit shifts instead.
1 parent 34ed1dc commit 9f773b8
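Conceptually, the new lowering treats each pair of adjacent bytes as one 16-bit lane. Below is a minimal scalar sketch of the SHL case (our illustration, not code from this commit; it assumes shift amounts below 8, the defined range for an i8 shift):

#include <cstdint>
#include <cstdio>

// Scalar model of the v64i8 SHL lowering: each 16-bit lane holds an even
// byte (low half) and an odd byte (high half).
void var_shl_v64i8_model(uint8_t r[64], const uint8_t amt[64]) {
  for (int i = 0; i < 64; i += 2) {
    uint16_t lane = (uint16_t)(r[i] | (r[i + 1] << 8));
    // Even result: shift the whole lane by the even byte's amount. Bits only
    // move upward, so the low byte of the result is already correct.
    uint16_t lo = (uint16_t)(lane << amt[i]);
    // Odd result: zero the even byte first so its bits cannot overflow into
    // the odd byte, then shift by the odd byte's amount.
    uint16_t hi = (uint16_t)((lane & 0xFF00) << amt[i + 1]);
    // Merge: even bytes from lo, odd bytes from hi (the masked vmovdqu8).
    r[i] = (uint8_t)lo;
    r[i + 1] = (uint8_t)(hi >> 8);
  }
}

int main() {
  uint8_t r[64] = {0x81, 0x81}, amt[64] = {1, 2};
  var_shl_v64i8_model(r, amt);
  printf("%#04x %#04x\n", r[0], r[1]); // prints 0x02 0x04
}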

5 files changed: 123 additions & 105 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 70 additions & 0 deletions
@@ -30968,6 +30968,76 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
   }
 
+  if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) {
+    // On AVX512BW, we can use variable 16-bit shifts to implement variable
+    // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi.
+    // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane
+    // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors
+    // can efficiently be merged together using a masked move.
+    MVT ExtVT = MVT::v32i16;
+
+    // When used in a vector shuffle, selects even-index lanes from the first
+    // vector and odd-index lanes from the second vector.
+    SmallVector<int, 64> InterleaveIndices;
+    for (unsigned i = 0; i < 64; ++i) {
+      unsigned offset = (i % 2 == 0) ? 0 : 64;
+      InterleaveIndices.push_back(i + offset);
+    }
+
+    SDValue zero = DAG.getConstant(0, dl, VT);
+    SDValue eight = DAG.getTargetConstant(8, dl, MVT::i8);
+    SDValue RLo, RHi;
+
+    // Isolate lower and upper lanes of Amt by shuffling zeros into AmtLo and
+    // right shifting AmtHi.
+    SDValue AmtLo = DAG.getBitcast(
+        ExtVT, DAG.getVectorShuffle(VT, dl, Amt, zero, InterleaveIndices));
+    SDValue AmtHi = DAG.getNode(X86ISD::VSRLI, dl, ExtVT,
+                                DAG.getBitcast(ExtVT, Amt), eight);
+    unsigned int ShiftOp;
+    switch (Opc) {
+    case ISD::SHL:
+      // Because we shift left, no bits from the high half can influence the low
+      // half, so we don't need to mask RLo. We do however need to mask RHi, to
+      // prevent high bits of an even lane overflowing into low bits of an odd
+      // lane.
+      RLo = DAG.getBitcast(ExtVT, R);
+      RHi = DAG.getBitcast(
+          ExtVT, DAG.getVectorShuffle(VT, dl, zero, R, InterleaveIndices));
+      ShiftOp = X86ISD::VSHLV;
+      break;
+    case ISD::SRL:
+      // Same idea as above, but this time we need to make sure no low bits of
+      // an odd lane can overflow into high bits of an even lane.
+      RLo = DAG.getBitcast(
+          ExtVT, DAG.getVectorShuffle(VT, dl, R, zero, InterleaveIndices));
+      RHi = DAG.getBitcast(ExtVT, R);
+      ShiftOp = X86ISD::VSRLV;
+      break;
+    case ISD::SRA:
+      // For arithmetic right shifts, we want to sign extend each even lane of R
+      // such that the upper half of the corresponding lane of RLo is 0 or -1
+      // depending on the sign bit of the original lane. We do this using 2
+      // immediate shifts.
+      RHi = DAG.getBitcast(ExtVT, R);
+      RLo = DAG.getNode(X86ISD::VSHLI, dl, ExtVT, RHi, eight);
+      RLo = DAG.getNode(X86ISD::VSRAI, dl, ExtVT, RLo, eight);
+      ShiftOp = X86ISD::VSRAV;
+      break;
+    default:
+      llvm_unreachable("Unexpected Shift Op");
+      return SDValue();
+    }
+
+    SDValue ShiftedLo =
+        DAG.getBitcast(VT, DAG.getNode(ShiftOp, dl, ExtVT, RLo, AmtLo));
+    SDValue ShiftedHi =
+        DAG.getBitcast(VT, DAG.getNode(ShiftOp, dl, ExtVT, RHi, AmtHi));
+
+    return DAG.getVectorShuffle(VT, dl, ShiftedLo, ShiftedHi,
+                                InterleaveIndices);
+  }
+
   if (VT == MVT::v16i8 ||
       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
       (VT == MVT::v64i8 && Subtarget.hasBWI())) {
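The SRA case is the subtle one. Here is a small standalone check (our example values, not from the commit) of why the VSHLI/VSRAI pair makes a 16-bit arithmetic shift agree with an 8-bit one on the even byte:

#include <cstdint>
#include <cstdio>

// To arithmetic-shift the even byte of a 16-bit lane, the patch first does
// VSHLI(8) then VSRAI(8), which replicates the even byte's sign bit through
// the lane's upper half. A 16-bit arithmetic shift of that lane then matches
// an 8-bit arithmetic shift of the byte.
int main() {
  const int8_t even = -0x40;  // 0xC0: even byte of R (example value)
  const int8_t odd = 0x05;    // odd byte of R (example value)
  const uint16_t lane = (uint16_t)((uint8_t)even | ((uint8_t)odd << 8));

  // RLo = VSRAI(VSHLI(lane, 8), 8): lane becomes 0xFFC0, i.e. the even byte
  // sign-extended to 16 bits.
  const int16_t rlo = (int16_t)(lane << 8) >> 8;

  // A variable 16-bit arithmetic shift now gives the right 8-bit answer in
  // the low byte: (-0x40) >> 3 == -8 == 0xF8.
  const int amt = 3;
  printf("rlo = %#06x, rlo >> %d = %#06x (low byte %#04x)\n",
         (uint16_t)rlo, amt, (uint16_t)(rlo >> amt), (uint8_t)(rlo >> amt));
}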

llvm/test/CodeGen/X86/gfni-shifts.ll

Lines changed: 27 additions & 48 deletions
@@ -1684,15 +1684,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; GFNIAVX512BW-LABEL: var_shl_v64i8:
 ; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm3
+; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm3, %zmm2
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
 ; GFNIAVX512BW-NEXT: retq
   %shift = shl <64 x i8> %a, %b
   ret <64 x i8> %shift
@@ -1876,15 +1875,16 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; GFNIAVX512BW-LABEL: var_lshr_v64i8:
 ; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
 ; GFNIAVX512BW-NEXT: retq
   %shift = lshr <64 x i8> %a, %b
   ret <64 x i8> %shift
@@ -2232,36 +2232,15 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; GFNIAVX512BW-LABEL: var_ashr_v64i8:
 ; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
 ; GFNIAVX512BW-NEXT: retq
   %shift = ashr <64 x i8> %a, %b
   ret <64 x i8> %shift
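The movabsq constant in the new output is the odd-byte select mask for the final vmovdqu8 merge. A quick standalone check (ours) that it has exactly the odd bits set:

#include <cassert>
#include <cstdint>

int main() {
  // Build a 64-bit mask with bit i set for every odd i: byte lane i of the
  // final vmovdqu8 merge comes from the "high" shift result when i is odd.
  uint64_t mask = 0;
  for (int i = 1; i < 64; i += 2)
    mask |= 1ULL << i;
  // 0xAAAAAAAAAAAAAAAA equals -6148914691236517206 as a signed 64-bit value,
  // which is exactly the movabsq immediate in the generated code.
  assert(mask == 0xAAAAAAAAAAAAAAAAULL);
  assert((int64_t)mask == -6148914691236517206LL);
}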

llvm/test/CodeGen/X86/vector-shift-ashr-512.ll

Lines changed: 9 additions & 30 deletions
@@ -106,36 +106,15 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT: retq
   %shift = ashr <64 x i8> %a, %b
   ret <64 x i8> %shift

llvm/test/CodeGen/X86/vector-shift-lshr-512.ll

Lines changed: 10 additions & 15 deletions
@@ -85,21 +85,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
   %shift = lshr <64 x i8> %a, %b
   ret <64 x i8> %shift
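The vpandq with the repeated [255,0] pattern above masks both the even-byte amounts and the even-byte source to the low half of each 16-bit lane. A scalar sketch of the SRL case (ours, under the same assumptions as the SHL model near the top):

#include <cstdint>

// Scalar model of the v64i8 SRL lowering: mask the lane to its even (low)
// byte before shifting, so no bits of the odd byte can leak downward.
void var_lshr_v64i8_model(uint8_t r[64], const uint8_t amt[64]) {
  for (int i = 0; i < 64; i += 2) {
    uint16_t lane = (uint16_t)(r[i] | (r[i + 1] << 8));
    // Even result: masked lane shifted by the even byte's amount (vpsrlvw on
    // the vpandq-masked inputs).
    uint16_t lo = (uint16_t)((lane & 0x00FF) >> amt[i]);
    // Odd result: shift the unmasked lane; bits only move downward, away
    // from the odd byte, so no masking is needed there.
    uint16_t hi = (uint16_t)(lane >> amt[i + 1]);
    r[i] = (uint8_t)lo;            // even bytes from the masked shift
    r[i + 1] = (uint8_t)(hi >> 8); // odd bytes from the unmasked shift
  }
}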

llvm/test/CodeGen/X86/vector-shift-shl-512.ll

Lines changed: 7 additions & 12 deletions
@@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm3
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; AVX512BW-NEXT: kmovq %rax, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
 ; AVX512BW-NEXT: retq
   %shift = shl <64 x i8> %a, %b
   ret <64 x i8> %shift
