From 9b477a7bceaf56a915db625d7baaccb90c1c8ae2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 18 Mar 2025 11:46:39 +0000
Subject: [PATCH] [X86] fold AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0) to
 support AVX512 predicated {k}{z} masks

We already do this for the equivalent ANDNP(SEXT(SETCC()),X) pattern.

Fixes #109272
---
 llvm/lib/Target/X86/X86ISelLowering.cpp      | 18 +++++++++
 llvm/test/CodeGen/X86/gfni-lzcnt.ll          | 20 ++++------
 llvm/test/CodeGen/X86/vector-lzcnt-512.ll    | 40 ++++++++------------
 .../vector-shuffle-combining-avx512vbmi.ll   | 18 +++------
 4 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 49a8f62421f68..dc3f313462a43 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -51347,6 +51347,8 @@ static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
+  using namespace SDPatternMatch;
+
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
@@ -51481,6 +51483,22 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
+  // to make use of predicated selects.
+  // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
+  if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
+    SDValue X, Y;
+    EVT CondVT = VT.changeVectorElementType(MVT::i1);
+    if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
+        sd_match(N, m_And(m_Value(X),
+                          m_OneUse(m_SExt(m_AllOf(
+                              m_Value(Y), m_SpecificVT(CondVT),
+                              m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
+      return DAG.getSelect(dl, VT, Y, X,
+                           getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
+    }
+  }
+
   // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
   // avoids slow variable shift (moving shift amount to ECX etc.)
   if (isOneConstant(N1) && N0->hasOneUse()) {
diff --git a/llvm/test/CodeGen/X86/gfni-lzcnt.ll b/llvm/test/CodeGen/X86/gfni-lzcnt.ll
index e84af84b36aa9..8e48950c32cd8 100644
--- a/llvm/test/CodeGen/X86/gfni-lzcnt.ll
+++ b/llvm/test/CodeGen/X86/gfni-lzcnt.ll
@@ -360,14 +360,12 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; GFNIAVX512BW-LABEL: testv64i8:
 ; GFNIAVX512BW: # %bb.0:
 ; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
+; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
 ; GFNIAVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; GFNIAVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
-; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
-; GFNIAVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; GFNIAVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
+; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
 ; GFNIAVX512BW-NEXT: retq
   %out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 0)
   ret <64 x i8> %out
@@ -494,14 +492,12 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ; GFNIAVX512BW-LABEL: testv64i8u:
 ; GFNIAVX512BW: # %bb.0:
 ; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
+; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
 ; GFNIAVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; GFNIAVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
-; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
-; GFNIAVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; GFNIAVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
+; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
 ; GFNIAVX512BW-NEXT: retq
   %out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 -1)
   ret <64 x i8> %out
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
index a722a5aee873b..d35a365508d54 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -369,14 +369,12 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
-; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm3, %zmm1
 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -455,14 +453,12 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
-; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm3, %zmm1
 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -561,14 +557,12 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512DQ-LABEL: testv64i8:
@@ -651,14 +645,12 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512DQ-LABEL: testv64i8u:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
index 220653e99addb..7d6ca16313583 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
@@ -149,10 +149,8 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64
 define <64 x i8> @combine_vpermi2var_v64i8_with_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %a2) {
 ; CHECK-LABEL: combine_vpermi2var_v64i8_with_mask:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vpmovb2m %zmm1, %k0
-; CHECK-NEXT: vpmovm2b %k0, %zmm1
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpmovb2m %zmm1, %k1
+; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT: ret{{[l|q]}}
   %perm = tail call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %a2)
   %cmp = icmp slt <64 x i8> %a1, zeroinitializer
@@ -177,19 +175,15 @@ define <64 x i8> @combine_vpermi2var_constant_v64i8_with_mask(<64 x i8> %a0) {
 ; X86-LABEL: combine_vpermi2var_constant_v64i8_with_mask:
 ; X86: # %bb.0:
 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
-; X86-NEXT: vpermt2b {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm1
-; X86-NEXT: vpmovb2m %zmm0, %k0
-; X86-NEXT: vpmovm2b %k0, %zmm0
-; X86-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; X86-NEXT: vpmovb2m %zmm0, %k1
+; X86-NEXT: vpermi2b {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: combine_vpermi2var_constant_v64i8_with_mask:
 ; X64: # %bb.0:
 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
-; X64-NEXT: vpermt2b {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
-; X64-NEXT: vpmovb2m %zmm0, %k0
-; X64-NEXT: vpmovm2b %k0, %zmm0
-; X64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; X64-NEXT: vpmovb2m %zmm0, %k1
+; X64-NEXT: vpermi2b {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 {%k1} {z}
 ; X64-NEXT: retq
   %perm = tail call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> , <64 x i8> %a0, <64 x i8> )
   %cmp = icmp slt <64 x i8> %a0, zeroinitializer
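Note (illustrative sketch, not part of the patch above): a minimal hand-written IR function showing the AND(X,SEXT(SETCC())) shape the new combine rewrites to SELECT(SETCC(),X,0). The function name and operand choices are made up for illustration; when built with something like llc -mtriple=x86_64-- -mattr=+avx512bw, the compare should now feed a %k register used as a zeroing ({z}) predicate instead of going through a vpmovm2b + vpandq pair.

; Illustrative only: AND of a value with a sign-extended vector compare.
define <64 x i8> @and_of_sext_icmp(<64 x i8> %x, <64 x i8> %a, <64 x i8> %b) {
  %cmp  = icmp eq <64 x i8> %a, %b          ; SETCC producing a v64i1 mask
  %sext = sext <64 x i1> %cmp to <64 x i8>  ; all-ones / all-zeros byte per lane
  %and  = and <64 x i8> %x, %sext           ; previously lowered via vpmovm2b + vpandq
  ret <64 x i8> %and                        ; now expected to use a {%k} {z} predicated op
}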