[X86] Lower CTTZ/CTLZ vXi8 vectors using GF2P8AFFINEQB #118012
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

CTTZ can be lowered using GF2P8AFFINEQB if we isolate the lowest set bit (or zero, when the input is zero) and then use GF2P8AFFINEQB to perform a lookup. With CTTZ in place, CTLZ can be lowered as CTTZ(BITREVERSE()).

As discussed on #110308.

Patch is 79.36 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118012.diff

3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d490de06590f78..528c3002536f12 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1330,6 +1330,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BITREVERSE, MVT::i16, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i64, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
@@ -1695,6 +1697,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MGATHER, VT, Custom);
}
+
+ if (Subtarget.hasGFNI()) {
+ setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v32i8, Custom);
+ }
}
if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
@@ -2079,6 +2086,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
setOperationAction(ISD::FABS, MVT::v32f16, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v32f16, Custom);
+
+ if (Subtarget.hasGFNI()) {
+ setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
+ setOperationAction(ISD::CTTZ, MVT::v64i8, Custom);
+ }
}// useAVX512Regs
if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
@@ -28412,6 +28424,46 @@ SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
}
+// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
+uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
+ assert((Amt < 8) && "Shift/Rotation amount out of range");
+ switch (Opcode) {
+ case ISD::BITREVERSE:
+ return 0x8040201008040201ULL;
+ case ISD::CTTZ:
+ // Special case - only works for zero/single bit input.
+ return 0xAACCF0FF00000000ULL;
+ case ISD::SHL:
+ return ((0x0102040810204080ULL >> (Amt)) &
+ (0x0101010101010101ULL * (0xFF >> (Amt))));
+ case ISD::SRL:
+ return ((0x0102040810204080ULL << (Amt)) &
+ (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
+ case ISD::SRA:
+ return (getGFNICtrlImm(ISD::SRL, Amt) |
+ (0x8080808080808080ULL >> (64 - (8 * Amt))));
+ case ISD::ROTL:
+ return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
+ case ISD::ROTR:
+ return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
+ }
+ llvm_unreachable("Unsupported GFNI opcode");
+}
+
+// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
+SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
+ MVT VT, unsigned Amt = 0) {
+ assert(VT.getVectorElementType() == MVT::i8 &&
+ (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
+ uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
+ SmallVector<SDValue> MaskBits;
+ for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
+ uint64_t Bits = (Imm >> (I % 64)) & 255;
+ MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
+ }
+ return DAG.getBuildVector(VT, DL, MaskBits);
+}
+
/// Lower a vector CTLZ using native supported vector CTLZ instruction.
//
// i8/i16 vector implemented using dword LZCNT vector instruction
@@ -28535,6 +28587,11 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
+ // GFNI targets - fold as cttz(bitreverse())
+ if (Subtarget.hasGFNI() && VT.getVectorElementType() == MVT::i8)
+ return DAG.getNode(ISD::CTTZ, DL, VT,
+ DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0)));
+
if (Subtarget.hasCDI() &&
// vXi8 vectors need to be promoted to 512-bits for vXi32.
(Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
@@ -28598,6 +28655,14 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
+ // GFNI - isolate LSB and perform GF2P8AFFINEQB lookup.
+ if (Subtarget.hasGFNI() && VT.isVector()) {
+ SDValue B = DAG.getNode(ISD::AND, dl, VT, N0, DAG.getNegative(N0, dl, VT));
+ SDValue M = getGFNICtrlMask(ISD::CTTZ, DAG, dl, VT);
+ return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, B, M,
+ DAG.getTargetConstant(0x8, dl, MVT::i8));
+ }
+
assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
"Only scalar CTTZ requires custom lowering");
@@ -29597,43 +29662,6 @@ SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
}
-// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
-uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
- assert((Amt < 8) && "Shift/Rotation amount out of range");
- switch (Opcode) {
- case ISD::BITREVERSE:
- return 0x8040201008040201ULL;
- case ISD::SHL:
- return ((0x0102040810204080ULL >> (Amt)) &
- (0x0101010101010101ULL * (0xFF >> (Amt))));
- case ISD::SRL:
- return ((0x0102040810204080ULL << (Amt)) &
- (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
- case ISD::SRA:
- return (getGFNICtrlImm(ISD::SRL, Amt) |
- (0x8080808080808080ULL >> (64 - (8 * Amt))));
- case ISD::ROTL:
- return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
- case ISD::ROTR:
- return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
- }
- llvm_unreachable("Unsupported GFNI opcode");
-}
-
-// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
-SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT,
- unsigned Amt = 0) {
- assert(VT.getVectorElementType() == MVT::i8 &&
- (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
- uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
- SmallVector<SDValue> MaskBits;
- for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
- uint64_t Bits = (Imm >> (I % 64)) & 255;
- MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
- }
- return DAG.getBuildVector(VT, DL, MaskBits);
-}
-
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
diff --git a/llvm/test/CodeGen/X86/gfni-lzcnt.ll b/llvm/test/CodeGen/X86/gfni-lzcnt.ll
index e84af84b36aa9e..5e7894d821d48f 100644
--- a/llvm/test/CodeGen/X86/gfni-lzcnt.ll
+++ b/llvm/test/CodeGen/X86/gfni-lzcnt.ll
@@ -8,40 +8,29 @@
define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; GFNISSE-LABEL: testv16i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNISSE-NEXT: movdqa %xmm1, %xmm2
-; GFNISSE-NEXT: pshufb %xmm0, %xmm2
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: pxor %xmm3, %xmm3
-; GFNISSE-NEXT: pcmpeqb %xmm0, %xmm3
-; GFNISSE-NEXT: pand %xmm2, %xmm3
-; GFNISSE-NEXT: pshufb %xmm0, %xmm1
-; GFNISSE-NEXT: paddb %xmm3, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
+; GFNISSE-NEXT: pxor %xmm1, %xmm1
+; GFNISSE-NEXT: psubb %xmm0, %xmm1
+; GFNISSE-NEXT: pand %xmm1, %xmm0
+; GFNISSE-NEXT: gf2p8affineqb $8, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1OR2-LABEL: testv16i8:
; GFNIAVX1OR2: # %bb.0:
-; GFNIAVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX1OR2-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; GFNIAVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vgf2p8affineqb $8, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: retq
;
; GFNIAVX512-LABEL: testv16i8:
; GFNIAVX512: # %bb.0:
-; GFNIAVX512-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; GFNIAVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX512-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3
-; GFNIAVX512-NEXT: vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; GFNIAVX512-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; GFNIAVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; GFNIAVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; GFNIAVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; GFNIAVX512-NEXT: vgf2p8affineqb $8, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; GFNIAVX512-NEXT: retq
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0)
ret <16 x i8> %out
@@ -50,40 +39,29 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; GFNISSE-LABEL: testv16i8u:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNISSE-NEXT: movdqa %xmm1, %xmm2
-; GFNISSE-NEXT: pshufb %xmm0, %xmm2
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: pxor %xmm3, %xmm3
-; GFNISSE-NEXT: pcmpeqb %xmm0, %xmm3
-; GFNISSE-NEXT: pand %xmm2, %xmm3
-; GFNISSE-NEXT: pshufb %xmm0, %xmm1
-; GFNISSE-NEXT: paddb %xmm3, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
+; GFNISSE-NEXT: pxor %xmm1, %xmm1
+; GFNISSE-NEXT: psubb %xmm0, %xmm1
+; GFNISSE-NEXT: pand %xmm1, %xmm0
+; GFNISSE-NEXT: gf2p8affineqb $8, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1OR2-LABEL: testv16i8u:
; GFNIAVX1OR2: # %bb.0:
-; GFNIAVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX1OR2-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; GFNIAVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vgf2p8affineqb $8, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: retq
;
; GFNIAVX512-LABEL: testv16i8u:
; GFNIAVX512: # %bb.0:
-; GFNIAVX512-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; GFNIAVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX512-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3
-; GFNIAVX512-NEXT: vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; GFNIAVX512-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; GFNIAVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; GFNIAVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; GFNIAVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; GFNIAVX512-NEXT: vgf2p8affineqb $8, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; GFNIAVX512-NEXT: retq
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1)
ret <16 x i8> %out
@@ -92,73 +70,52 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; GFNISSE-LABEL: testv32i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNISSE-NEXT: movdqa %xmm2, %xmm3
-; GFNISSE-NEXT: pshufb %xmm0, %xmm3
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0
-; GFNISSE-NEXT: pxor %xmm5, %xmm5
-; GFNISSE-NEXT: movdqa %xmm2, %xmm6
-; GFNISSE-NEXT: pshufb %xmm0, %xmm6
-; GFNISSE-NEXT: pcmpeqb %xmm5, %xmm0
-; GFNISSE-NEXT: pand %xmm3, %xmm0
-; GFNISSE-NEXT: paddb %xmm6, %xmm0
-; GFNISSE-NEXT: movdqa %xmm2, %xmm3
-; GFNISSE-NEXT: pshufb %xmm1, %xmm3
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1
-; GFNISSE-NEXT: pcmpeqb %xmm1, %xmm5
-; GFNISSE-NEXT: pand %xmm3, %xmm5
-; GFNISSE-NEXT: pshufb %xmm1, %xmm2
-; GFNISSE-NEXT: paddb %xmm5, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm1
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0
+; GFNISSE-NEXT: pxor %xmm3, %xmm3
+; GFNISSE-NEXT: pxor %xmm4, %xmm4
+; GFNISSE-NEXT: psubb %xmm0, %xmm4
+; GFNISSE-NEXT: pand %xmm4, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,255,240,204,170,0,0,0,0,255,240,204,170]
+; GFNISSE-NEXT: gf2p8affineqb $8, %xmm4, %xmm0
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1
+; GFNISSE-NEXT: psubb %xmm1, %xmm3
+; GFNISSE-NEXT: pand %xmm3, %xmm1
+; GFNISSE-NEXT: gf2p8affineqb $8, %xmm4, %xmm1
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: testv32i8:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; GFNIAVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNIAVX1-NEXT: # xmm4 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
-; GFNIAVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
-; GFNIAVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm4
-; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNIAVX1-NEXT: # xmm2 = mem[0,0]
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm2, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; GFNIAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; GFNIAVX1-NEXT: vpsubb %xmm0, %xmm3, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vandps %ymm0, %ymm2, %ymm0
+; GFNIAVX1-NEXT: vgf2p8affineqb $8, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX1-NEXT: retq
;
; GFNIAVX2-LABEL: testv32i8:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX2-NEXT: # ymm1 = mem[0,1,0,1]
-; GFNIAVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX2-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3
-; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; GFNIAVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
-; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; GFNIAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; GFNIAVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vgf2p8affineqb $8, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512-LABEL: testv32i8:
; GFNIAVX512: # %bb.0:
-; GFNIAVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX512-NEXT: # ymm1 = mem[0,1,0,1]
-; GFNIAVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; GFNIAVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX512-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3
-; GFNIAVX512-NEXT: vpand %ymm3, %ymm2, %ymm2
-; GFNIAVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
-; GFNIAVX512-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; GFNIAVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; GFNIAVX512-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; GFNIAVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
+; GFNIAVX512-NEXT: vgf2p8affineqb $8, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; GFNIAVX512-NEXT: retq
%out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 0)
ret <32 x i8> %out
@@ -167,73 +124,52 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; GFNISSE-LABEL: testv32i8u:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNISSE-NEXT: movdqa %xmm2, %xmm3
-; GFNISSE-NEXT: pshufb %xmm0, %xmm3
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0
-; GFNISSE-NEXT: pxor %xmm5, %xmm5
-; GFNISSE-NEXT: movdqa %xmm2, %xmm6
-; GFNISSE-NEXT: pshufb %xmm0, %xmm6
-; GFNISSE-NEXT: pcmpeqb %xmm5, %xmm0
-; GFNISSE-NEXT: pand %xmm3, %xmm0
-; GFNISSE-NEXT: paddb %xmm6, %xmm0
-; GFNISSE-NEXT: movdqa %xmm2, %xmm3
-; GFNISSE-NEXT: pshufb %xmm1, %xmm3
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1
-; GFNISSE-NEXT: pcmpeqb %xmm1, %xmm5
-; GFNISSE-NEXT: pand %xmm3, %xmm5
-; GFNISSE-NEXT: pshufb %xmm1, %xmm2
-; GFNISSE-NEXT: paddb %xmm5, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm1
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0
+; GFNISSE-NEXT: pxor %xmm3, %xmm3
+; GFNISSE-NEXT: pxor %xmm4, %xmm4
+; GFNISSE-NEXT: psubb %xmm0, %xmm4
+; GFNISSE-NEXT: pand %xmm4, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,255,240,204,170,0,0,0,0,255,240,204,170]
+; GFNISSE-NEXT: gf2p8affineqb $8, %xmm4, %xmm0
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1
+; GFNISSE-NEXT: psubb %xmm1, %xmm3
+; GFNISSE-NEXT: pand %xmm3, %xmm1
+; GFNISSE-NEXT: gf2p8affineqb $8, %xmm4, %xmm1
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: testv32i8u:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; GFNIAVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNIAVX1-NEXT: # xmm4 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
-; GFNIAVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
-; GFNIAVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm4
-; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNIAVX1-NEXT: # xmm2 = mem[0,0]
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm2, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; GFNIAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; GFNIAVX1-NEXT: vpsubb %xmm0, %xmm3, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vandps %ymm0, %ymm2, %ymm0
+; GFNIAVX1-NEXT: vgf2p8affineqb $8, {{\.?LCPI[0-9]...
[truncated]
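As an illustration (not part of the patch), here is a minimal scalar model of the per-byte CTTZ sequence described in the PR summary above: isolate the lowest set bit with `x & -x`, then map the resulting single-bit (or zero) value to its trailing-zero count, which is the role the GF2P8AFFINEQB lookup with immediate 8 plays in the new lowering. The helper names (`isolateLSB`, `cttz8`) are hypothetical and exist only for this sketch.

```cpp
// Scalar sketch of the per-byte computation the new vXi8 CTTZ lowering emits:
//   1. B = X & (-X)              -- isolates the lowest set bit (B == 0 when X == 0)
//   2. affine lookup of B        -- maps the single-bit value (or zero) to its
//                                   trailing-zero count; immediate 8 covers X == 0.
#include <cassert>
#include <cstdint>

// Hypothetical helper mirroring step 1 of the lowering.
static uint8_t isolateLSB(uint8_t X) {
  return X & static_cast<uint8_t>(-X);
}

// Reference result the affine lookup is expected to produce for each byte.
static uint8_t cttz8(uint8_t X) {
  if (X == 0)
    return 8; // matches the immediate-8 fixup used by the patch
  uint8_t N = 0;
  while (!(X & 1)) { // count trailing zero bits
    X >>= 1;
    ++N;
  }
  return N;
}

int main() {
  for (unsigned V = 0; V < 256; ++V) {
    uint8_t B = isolateLSB(static_cast<uint8_t>(V));
    // B is a power of two (or zero), so its trailing-zero count equals V's.
    assert(cttz8(B) == cttz8(static_cast<uint8_t>(V)));
  }
  return 0;
}
```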
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed: 6cd8b5e to 6b5dae9
CTTZ can be lowered using GF2P8AFFINEQB if we isolate the lowest set bit (or zero) and then use GF2P8AFFINEQB to perform a lookup. With CTTZ, CTLZ can be lowered as CTTZ(BITREVERSE()). As discussed on llvm#110308
Force-pushed: 6b5dae9 to d374979
CTTZ can be lowered using GF2P8AFFINEQB if we isolate the lowest set bit (or zero) and then use GF2P8AFFINEQB to perform a lookup.
With CTTZ, CTLZ can be lowered as CTTZ(BITREVERSE()).
As discussed on #110308
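For the CTLZ half, the identity the patch relies on is CTLZ(x) = CTTZ(BITREVERSE(x)). Below is a small self-contained check of that identity over all 8-bit values, again as an illustrative sketch with hypothetical helper names (`bitreverse8`, `cttz8`, `ctlz8`) rather than code taken from the patch.

```cpp
// Verifies CTLZ(x) == CTTZ(BITREVERSE(x)) for every 8-bit value,
// including x == 0 where both sides are 8.
#include <cassert>
#include <cstdint>

static uint8_t bitreverse8(uint8_t X) {
  uint8_t R = 0;
  for (int I = 0; I < 8; ++I)
    R |= ((X >> I) & 1) << (7 - I); // move bit I to bit 7-I
  return R;
}

static uint8_t cttz8(uint8_t X) {
  if (X == 0)
    return 8;
  uint8_t N = 0;
  while (!(X & 1)) { // count trailing zero bits
    X >>= 1;
    ++N;
  }
  return N;
}

static uint8_t ctlz8(uint8_t X) {
  if (X == 0)
    return 8;
  uint8_t N = 0;
  while (!(X & 0x80)) { // count leading zero bits
    X = static_cast<uint8_t>(X << 1);
    ++N;
  }
  return N;
}

int main() {
  for (unsigned V = 0; V < 256; ++V)
    assert(ctlz8(static_cast<uint8_t>(V)) ==
           cttz8(bitreverse8(static_cast<uint8_t>(V))));
  return 0;
}
```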