
Commit 1211d97

[X86] Use SWAR techniques for some vector i8 shifts
SSE & AVX do not include instructions for shifting i8 vectors. Instead, such shifts must be synthesized via other instructions. If the pairs of i8 elements within a vector share a shift amount, we can use SWAR techniques to substantially reduce the amount of code generated.

Say we were going to execute this shift right:

  x >> {0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, ...}

LLVM would previously generate:

  vpxor %xmm1, %xmm1, %xmm1
  vpunpckhbw %ymm0, %ymm1, %ymm2
  vpunpckhbw %ymm1, %ymm0, %ymm3
  vpsllw $4, %ymm3, %ymm3
  vpblendd $204, %ymm3, %ymm2, %ymm2
  vpsrlw $8, %ymm2, %ymm2
  vpunpcklbw %ymm0, %ymm1, %ymm3
  vpunpcklbw %ymm1, %ymm0, %ymm0
  vpsllw $4, %ymm0, %ymm0
  vpblendd $204, %ymm0, %ymm3, %ymm0
  vpsrlw $8, %ymm0, %ymm0
  vpackuswb %ymm2, %ymm0, %ymm0

Instead, we can reinterpret each pair of i8 elements as an i16 and shift it by the shared shift amount. The only thing we need to do is mask out any bits which crossed the boundary from the top i8 to the bottom i8. This SWAR-style technique achieves:

  vpsrlw $4, %ymm0, %ymm1
  vpblendd $170, %ymm1, %ymm0, %ymm0
  vpand .LCPI0_0(%rip), %ymm0, %ymm0

This is implemented for both left and right logical shift operations. Arithmetic shifts are less well behaved here because the shift cannot also perform the sign extension for the lower 8 bits.
1 parent 8c17ed1 commit 1211d97
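As an illustration (not part of the commit), here is the same SWAR idea in scalar C++: a pair of i8 lanes packed into one i16 is shifted together, and a mask clears anything that leaked across the lane boundary. The helper name swar_srl_pair is made up for this sketch.

#include <cassert>
#include <cstdint>

// Logical right shift of two 8-bit lanes packed into one uint16_t, both lanes
// shifted by the same amount. A single 16-bit shift moves both lanes at once;
// the mask then drops bits that crossed from the high lane into the low lane.
static uint16_t swar_srl_pair(uint16_t pair, unsigned amt) {
  uint16_t shifted = pair >> amt;
  uint16_t maskLow = (0xFFu >> amt) & 0xFFu; // bits that survive in the low lane
  uint16_t mask = maskLow | (maskLow << 8);  // same pattern for the high lane
  return shifted & mask;
}

int main() {
  // Lanes 0xAB (high) and 0xCD (low), both shifted right by 4 -> 0x0A and 0x0C.
  assert(swar_srl_pair(0xABCD, 4) == 0x0A0C);
  return 0;
}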

4 files changed: 240 additions & 5 deletions


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 90 additions & 5 deletions
@@ -29851,17 +29851,103 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                        DAG.getNode(Opc, dl, ExtVT, R, Amt));
   }
 
-  // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
-  // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
+  // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors by using
+  // vXi16 vector operations.
   if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
       (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
        (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
       !Subtarget.hasXOP()) {
     int NumElts = VT.getVectorNumElements();
+    MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
+    // We can do this extra fast if each pair of i8 elements is shifted by the
+    // same amount by doing this SWAR style: use a shift to move the valid bits
+    // to the right position, mask out any bits which crossed from one element
+    // to the other.
+    if (Opc == ISD::SRL || Opc == ISD::SHL) {
+      APInt UndefElts;
+      SmallVector<APInt, 64> AmtBits;
+      if (getTargetConstantBitsFromNode(Amt, /*EltSizeInBits=*/8, UndefElts,
+                                        AmtBits, /*AllowWholeUndefs=*/true,
+                                        /*AllowPartialUndefs=*/false)) {
+        // This optimized lowering is only valid if the elements in a pair can
+        // be treated identically.
+        bool SameShifts = true;
+        SmallVector<APInt, 32> AmtBits16(NumElts / 2);
+        APInt UndefElts16 = APInt::getZero(AmtBits16.size());
+        for (unsigned SrcI = 0, E = AmtBits.size(); SrcI != E; SrcI += 2) {
+          unsigned DstI = SrcI / 2;
+          // Both elements are undef? Make a note and keep going.
+          if (UndefElts[SrcI] && UndefElts[SrcI + 1]) {
+            AmtBits16[DstI] = APInt::getZero(16);
+            UndefElts16.setBit(DstI);
+            continue;
+          }
+          // Even element is undef? We will shift it by the same shift amount as
+          // the odd element.
+          if (UndefElts[SrcI]) {
+            AmtBits16[DstI] = AmtBits[SrcI + 1].zext(16);
+            continue;
+          }
+          // Odd element is undef? We will shift it by the same shift amount as
+          // the even element.
+          if (UndefElts[SrcI + 1]) {
+            AmtBits16[DstI] = AmtBits[SrcI].zext(16);
+            continue;
+          }
+          // Both elements are equal.
+          if (AmtBits[SrcI] == AmtBits[SrcI + 1]) {
+            AmtBits16[DstI] = AmtBits[SrcI].zext(16);
+            continue;
+          }
+          // One of the provisional i16 elements will not have the same shift
+          // amount. Let's bail.
+          SameShifts = false;
+          break;
+        }
+
+        // We are only dealing with identical pairs and the operation is a
+        // logical shift.
+        if (SameShifts) {
+          // Cast the operand to vXi16.
+          SDValue R16 = DAG.getBitcast(VT16, R);
+          // Create our new vector of shift amounts.
+          SDValue Amt16 = getConstVector(AmtBits16, UndefElts16, VT16, DAG, dl);
+          // Perform the actual shift.
+          SDValue ShiftedR = DAG.getNode(Opc, dl, VT16, R16, Amt16);
+          // Now we need to construct a mask which will "drop" bits that get
+          // shifted past the LSB/MSB. For a logical shift left, it will look
+          // like:
+          // MaskLowBits = (0xff << Amt16) & 0xff;
+          // MaskHighBits = MaskLowBits << 8;
+          // Mask = MaskLowBits | MaskHighBits;
+          //
+          // This masking ensures that bits cannot migrate from one i8 to
+          // another. The construction of this mask will be constant folded.
+          // The mask for a logical right shift is nearly identical, the only
+          // difference is that 0xff is shifted right instead of left.
+          SDValue Cst255 = DAG.getConstant(0xff, dl, MVT::i16);
+          SDValue Splat255 = DAG.getSplat(VT16, dl, Cst255);
+          // The mask for the low bits is most simply expressed as an 8-bit
+          // field of all ones which is shifted in the exact same way the data
+          // is shifted but masked with 0xff.
+          SDValue MaskLowBits = DAG.getNode(Opc, dl, VT16, Splat255, Amt16);
+          MaskLowBits = DAG.getNode(ISD::AND, dl, VT16, MaskLowBits, Splat255);
+          SDValue Cst8 = DAG.getConstant(8, dl, MVT::i16);
+          SDValue Splat8 = DAG.getSplat(VT16, dl, Cst8);
+          // The mask for the high bits is the same as the mask for the low
+          // bits but shifted up by 8.
+          SDValue MaskHighBits = DAG.getNode(ISD::SHL, dl, VT16, MaskLowBits, Splat8);
+          SDValue Mask = DAG.getNode(ISD::OR, dl, VT16, MaskLowBits, MaskHighBits);
+          // Finally, we mask the shifted vector with the SWAR mask.
+          SDValue Masked = DAG.getNode(ISD::AND, dl, VT16, ShiftedR, Mask);
+          return DAG.getBitcast(VT, Masked);
+        }
+      }
+    }
     SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
 
-    // Extend constant shift amount to vXi16 (it doesn't matter if the type
-    // isn't legal).
+    // Extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI (it
+    // doesn't matter if the type isn't legal).
     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
     Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
     Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
@@ -29885,7 +29971,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       }
     }
 
-    MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
     SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
     SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);

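As a worked example of the mask construction above (the arithmetic is mine, not spelled out in the commit): for a logical right shift, the low-byte mask is (0xff >> amt) & 0xff and the full i16 mask repeats that pattern in the high byte. The sketch below recomputes those masks for the per-pair shift amounts {7, 2, 4, 6, 1, 2, 3, 4} used by the new tests; it prints 257, 16191, 3855, 771, 32639, 16191, 7967 and 3855, which are exactly the vpand/vpandq constants visible in the updated 256-bit and 512-bit checks.

#include <cstdint>
#include <cstdio>

// Scalar recomputation of the SWAR masks that the lowering constant-folds for
// a logical right shift: shift 0xff right by the pair's shift amount, keep the
// low 8 bits, then replicate the pattern into the high byte of the i16.
int main() {
  const unsigned amts[] = {7, 2, 4, 6, 1, 2, 3, 4};
  for (unsigned amt : amts) {
    unsigned maskLow = (0xFFu >> amt) & 0xFFu;
    unsigned mask = maskLow | (maskLow << 8);
    std::printf("amt=%u mask=%u (0x%04x)\n", amt, mask, mask);
  }
  return 0;
}
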
llvm/test/CodeGen/X86/vector-shift-lshr-128.ll

Lines changed: 61 additions & 0 deletions
@@ -1226,6 +1226,67 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
   ret <8 x i16> %shift
 }
 
+define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind {
+; SSE-LABEL: constant_shift_v16i8_pairs:
+; SSE: # %bb.0:
+; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096]
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: constant_shift_v16i8_pairs:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: constant_shift_v16i8_pairs:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX512DQ-LABEL: constant_shift_v16i8_pairs:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: constant_shift_v16i8_pairs:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [7,7,2,2,4,4,6,6,1,1,2,2,3,3,4,4]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i8_pairs:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i8_pairs:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+;
+; X86-SSE-LABEL: constant_shift_v16i8_pairs:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096]
+; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT: retl
+  %shift = lshr <16 x i8> %a, <i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4>
+  ret <16 x i8> %shift
+}
+
 define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: constant_shift_v16i8:
 ; SSE2: # %bb.0:

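A note on why the new checks show pmulhuw rather than an explicit shift (my reading of the generated code, not a claim made by the commit): once the vXi8 shift has been rewritten as a vXi16 shift by constant amounts, the existing i16 lowering turns a logical right shift by k into an unsigned multiply-high by 2^(16-k). That is how the pair amounts {7, 2, 4, 6, 1, 2, 3, 4} become the pmulhuw constant [512, 16384, 4096, 1024, 32768, 16384, 8192, 4096] in the checks, followed by pand with the SWAR mask. The identity itself is easy to verify exhaustively:

#include <cassert>
#include <cstdint>

// For 16-bit unsigned x and 1 <= k <= 15, the high 16 bits of x * 2^(16-k)
// equal x >> k. This is the identity behind lowering a constant i16 logical
// right shift to a multiply-high. Checked for every 16-bit input below.
int main() {
  for (uint32_t x = 0; x <= 0xFFFF; ++x) {
    for (unsigned k = 1; k <= 15; ++k) {
      uint32_t mulhi = (x * (1u << (16 - k))) >> 16;
      assert(mulhi == (x >> k));
    }
  }
  return 0;
}
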
llvm/test/CodeGen/X86/vector-shift-lshr-256.ll

Lines changed: 66 additions & 0 deletions
@@ -1345,6 +1345,72 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
   ret <16 x i16> %shift
 }
 
+define <32 x i8> @constant_shift_v32i8_pairs(<32 x i8> %a) nounwind {
+; AVX1-LABEL: constant_shift_v32i8_pairs:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [512,16384,4096,1024,32768,16384,8192,4096]
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [257,16191,3855,771,32639,16191,7967,3855]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: constant_shift_v32i8_pairs:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; XOPAVX1-LABEL: constant_shift_v32i8_pairs:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [249,249,254,254,252,252,250,250,255,255,254,254,253,253,252,252]
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: constant_shift_v32i8_pairs:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [249,249,254,254,252,252,250,250,255,255,254,254,253,253,252,252]
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: retq
+;
+; AVX512DQ-LABEL: constant_shift_v32i8_pairs:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
+; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: constant_shift_v32i8_pairs:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQVL-LABEL: constant_shift_v32i8_pairs:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
+; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512DQVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: constant_shift_v32i8_pairs:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
+  %shift = lshr <32 x i8> %a, <i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4>
+  ret <32 x i8> %shift
+}
+
 define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX1-LABEL: constant_shift_v32i8:
 ; AVX1: # %bb.0:

llvm/test/CodeGen/X86/vector-shift-lshr-512.ll

Lines changed: 23 additions & 0 deletions
@@ -306,6 +306,29 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
   ret <32 x i16> %shift
 }
 
+define <64 x i8> @constant_shift_v64i8_pairs(<64 x i8> %a) nounwind {
+; AVX512DQ-LABEL: constant_shift_v64i8_pairs:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
+; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [257,16191,3855,771,32639,16191,7967,3855,257,16191,3855,771,32639,16191,7967,3855,257,16191,3855,771,32639,16191,7967,3855,257,16191,3855,771,32639,16191,7967,3855]
+; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: constant_shift_v64i8_pairs:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+  %shift = lshr <64 x i8> %a, <i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4>
+  ret <64 x i8> %shift
+}
+
 define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512DQ-LABEL: constant_shift_v64i8:
 ; AVX512DQ: # %bb.0:
