[InstCombine] Add folds for (add/sub/disjoint_or/icmp C, (ctpop (not x)))

goldsteinn · goldsteinn · commit 60e8915d2277 · 2024-01-15T12:05:38.000-08:00
`(ctpop (not x))` <-> `(sub nuw nsw BitWidth(x), (ctpop x))`. The `sub` expression can sometimes be constant folded depending on the use case of `(ctpop (not x))`. This patch adds folds for the following cases: `(add/sub/disjoint_or C, (ctpop (not x)))` -> `(add/sub/disjoint_or C', (ctpop x))`; `(cmp pred C, (ctpop (not x)))` -> `(cmp swapped_pred C', (ctpop x))`. Here `C'` depends on how we constant fold `C` with `BitWidth(x)` for the given opcode. Proofs: https://alive2.llvm.org/ce/z/qUgfF3 Closes #77859
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1683,6 +1683,9 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
     }
   }
 
+  if (Instruction *R = tryFoldInstWithCtpopWithNot(&I))
+    return R;
+
   // TODO(jingyue): Consider willNotOverflowSignedAdd and
   // willNotOverflowUnsignedAdd to reduce the number of invocations of
   // computeKnownBits.
@@ -2445,6 +2448,9 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
     }
   }
 
+  if (Instruction *R = tryFoldInstWithCtpopWithNot(&I))
+    return R;
+
   if (Instruction *R = foldSubOfMinMax(I, Builder))
     return R;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3398,6 +3398,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
   if (Instruction *R = foldBinOpShiftWithShift(I))
     return R;
 
+  if (Instruction *R = tryFoldInstWithCtpopWithNot(&I))
+    return R;
+
   Value *X, *Y;
   const APInt *CV;
   if (match(&I, m_c_Or(m_OneUse(m_Xor(m_Value(X), m_APInt(CV))), m_Value(Y))) &&
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1323,6 +1323,9 @@ Instruction *InstCombinerImpl::foldICmpWithConstant(ICmpInst &Cmp) {
       return replaceInstUsesWith(Cmp, NewPhi);
     }
 
+  if (Instruction *R = tryFoldInstWithCtpopWithNot(&Cmp))
+    return R;
+
   return nullptr;
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -505,6 +505,10 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   Value *SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS,
                                         Value *RHS);
 
+  // If `I` has operand `(ctpop (not x))`, fold `I` with `(sub nuw nsw
+  // BitWidth(x), (ctpop x))`.
+  Instruction *tryFoldInstWithCtpopWithNot(Instruction *I);
+
   // (Binop1 (Binop2 (logic_shift X, C), C1), (logic_shift Y, C))
   //    -> (logic_shift (Binop1 (Binop2 X, inv_logic_shift(C1, C)), Y), C)
   // (Binop1 (Binop2 (logic_shift X, Amt), Mask), (logic_shift Y, Amt))
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -740,6 +740,93 @@ static Value *tryFactorization(BinaryOperator &I, const SimplifyQuery &SQ,
   return RetVal;
 }
 
+// If `I` has one Const operand and the other matches `(ctpop (not x))`,
+// replace `(ctpop (not x))` with `(sub nuw nsw BitWidth(x), (ctpop x))`.
+// This is only useful if the new subtract can fold so we only handle the
+// following cases:
+//    1) (add/sub/disjoint_or C, (ctpop (not x)))
+//        -> (add/sub/disjoint_or C', (ctpop x))
+//    2) (cmp pred C, (ctpop (not x)))
+//        -> (cmp swapped_pred C', (ctpop x))
+Instruction *InstCombinerImpl::tryFoldInstWithCtpopWithNot(Instruction *I) {
+  unsigned Opc = I->getOpcode();
+  unsigned ConstIdx = 1;
+  switch (Opc) {
+  default:
+    return nullptr;
+    // (ctpop (not x)) <-> (sub nuw nsw BitWidth(x), (ctpop x))
+    // We can fold the BitWidth(x) with add/sub/icmp as long as the other
+    // operand is constant. For `sub` the constant must be operand 0.
+  case Instruction::Sub:
+    ConstIdx = 0;
+    break;
+  case Instruction::ICmp:
+    // Signed predicates aren't correct in some edge cases (e.g. i2 types).
+    // Also, since (ctpop x) is known to be in [0, BitWidth(x)], almost all
+    // signed comparisons against it are simplified to unsigned.
+    if (cast<ICmpInst>(I)->isSigned())
+      return nullptr;
+    break;
+  case Instruction::Or:
+    if (!match(I, m_DisjointOr(m_Value(), m_Value())))
+      return nullptr;
+    [[fallthrough]];
+  case Instruction::Add:
+    break;
+  }
+
+  Value *Op;
+  // Find a single-use ctpop on the non-constant side; `Op` is its argument.
+  if (!match(I->getOperand(1 - ConstIdx),
+             m_OneUse(m_Intrinsic<Intrinsic::ctpop>(m_Value(Op)))))
+    return nullptr;
+
+  Constant *C;
+  // Check other operand is ImmConstant.
+  if (!match(I->getOperand(ConstIdx), m_ImmConstant(C)))
+    return nullptr;
+
+  Type *Ty = Op->getType();
+  Constant *BitWidthC = ConstantInt::get(Ty, Ty->getScalarSizeInBits());
+  // Non-equality icmp needs `C u<= BitWidth(x)` so `BitWidth(x) - C` cannot
+  // wrap. Note if C is larger, the icmp generally simplifies to true/false.
+  if (Opc == Instruction::ICmp && !cast<ICmpInst>(I)->isEquality() &&
+      !ConstantExpr::getICmp(ICmpInst::ICMP_UGT, C, BitWidthC)->isZeroValue())
+    return nullptr;
+
+  // Check we can invert `Op` for free and that doing so consumes a `not`.
+  bool Consumes = false;
+  if (!isFreeToInvert(Op, Op->hasOneUse(), Consumes) || !Consumes)
+    return nullptr;
+  Value *NotOp = getFreelyInverted(Op, Op->hasOneUse(), &Builder);
+  assert(NotOp != nullptr &&
+         "Desync between isFreeToInvert and getFreelyInverted");
+
+  Value *CtpopOfNotOp = Builder.CreateIntrinsic(Ty, Intrinsic::ctpop, NotOp);
+
+  Value *R = nullptr;
+
+  // Do the transformation here to avoid potentially introducing an infinite
+  // loop.
+  switch (Opc) {
+  case Instruction::Sub:
+    R = Builder.CreateAdd(CtpopOfNotOp, ConstantExpr::getSub(C, BitWidthC));
+    break;
+  case Instruction::Or:
+  case Instruction::Add:
+    R = Builder.CreateSub(ConstantExpr::getAdd(C, BitWidthC), CtpopOfNotOp);
+    break;
+  case Instruction::ICmp:
+    R = Builder.CreateICmp(cast<ICmpInst>(I)->getSwappedPredicate(),
+                           CtpopOfNotOp, ConstantExpr::getSub(BitWidthC, C));
+    break;
+  default:
+    llvm_unreachable("Unhandled Opcode");
+  }
+  assert(R != nullptr);
+  return replaceInstUsesWith(*I, R);
+}
+
 // (Binop1 (Binop2 (logic_shift X, C), C1), (logic_shift Y, C))
 //   IFF
 //    1) the logic_shifts match
diff --git a/llvm/test/Transforms/InstCombine/fold-ctpop-of-not.ll b/llvm/test/Transforms/InstCombine/fold-ctpop-of-not.ll
@@ -8,9 +8,8 @@ declare <2 x i8> @llvm.ctpop.v2i8(<2 x i8>)
 
 define i8 @fold_sub_c_ctpop(i8 %x) {
 ; CHECK-LABEL: @fold_sub_c_ctpop(
-; CHECK-NEXT:    [[NX:%.*]] = xor i8 [[X:%.*]], -1
-; CHECK-NEXT:    [[CNT:%.*]] = call i8 @llvm.ctpop.i8(i8 [[NX]]), !range [[RNG0:![0-9]+]]
-; CHECK-NEXT:    [[R:%.*]] = sub nuw nsw i8 12, [[CNT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.ctpop.i8(i8 [[X:%.*]]), !range [[RNG0:![0-9]+]]
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i8 [[TMP1]], 4
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %nx = xor i8 %x, -1
@@ -34,9 +33,8 @@ define i8 @fold_sub_var_ctpop_fail(i8 %x, i8 %y) {
 
 define <2 x i8> @fold_sub_ctpop_c(<2 x i8> %x) {
 ; CHECK-LABEL: @fold_sub_ctpop_c(
-; CHECK-NEXT:    [[NX:%.*]] = xor <2 x i8> [[X:%.*]], <i8 -1, i8 -1>
-; CHECK-NEXT:    [[CNT:%.*]] = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[NX]]), !range [[RNG0]]
-; CHECK-NEXT:    [[R:%.*]] = add nuw nsw <2 x i8> [[CNT]], <i8 -63, i8 -64>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[X:%.*]]), !range [[RNG0]]
+; CHECK-NEXT:    [[R:%.*]] = sub nuw nsw <2 x i8> <i8 -55, i8 -56>, [[TMP1]]
 ; CHECK-NEXT:    ret <2 x i8> [[R]]
 ;
   %nx = xor <2 x i8> %x, <i8 -1, i8 -1>
@@ -47,9 +45,8 @@ define <2 x i8> @fold_sub_ctpop_c(<2 x i8> %x) {
 
 define i8 @fold_add_ctpop_c(i8 %x) {
 ; CHECK-LABEL: @fold_add_ctpop_c(
-; CHECK-NEXT:    [[NX:%.*]] = xor i8 [[X:%.*]], -1
-; CHECK-NEXT:    [[CNT:%.*]] = call i8 @llvm.ctpop.i8(i8 [[NX]]), !range [[RNG0]]
-; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i8 [[CNT]], 63
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.ctpop.i8(i8 [[X:%.*]]), !range [[RNG0]]
+; CHECK-NEXT:    [[R:%.*]] = sub nuw nsw i8 71, [[TMP1]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %nx = xor i8 %x, -1
@@ -60,9 +57,8 @@ define i8 @fold_add_ctpop_c(i8 %x) {
 
 define i8 @fold_distjoint_or_ctpop_c(i8 %x) {
 ; CHECK-LABEL: @fold_distjoint_or_ctpop_c(
-; CHECK-NEXT:    [[NX:%.*]] = xor i8 [[X:%.*]], -1
-; CHECK-NEXT:    [[CNT:%.*]] = call i8 @llvm.ctpop.i8(i8 [[NX]]), !range [[RNG0]]
-; CHECK-NEXT:    [[R:%.*]] = or disjoint i8 [[CNT]], 64
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.ctpop.i8(i8 [[X:%.*]]), !range [[RNG0]]
+; CHECK-NEXT:    [[R:%.*]] = sub nuw nsw i8 72, [[TMP1]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %nx = xor i8 %x, -1
@@ -109,9 +105,8 @@ define i1 @fold_icmp_sgt_ctpop_c_i2_fail(i2 %x, i2 %C) {
 
 define i1 @fold_cmp_eq_ctpop_c(i8 %x) {
 ; CHECK-LABEL: @fold_cmp_eq_ctpop_c(
-; CHECK-NEXT:    [[NX:%.*]] = xor i8 [[X:%.*]], -1
-; CHECK-NEXT:    [[CNT:%.*]] = call i8 @llvm.ctpop.i8(i8 [[NX]]), !range [[RNG0]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[CNT]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.ctpop.i8(i8 [[X:%.*]]), !range [[RNG0]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[TMP1]], 6
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %nx = xor i8 %x, -1
@@ -137,9 +132,8 @@ define i1 @fold_cmp_eq_ctpop_c_multiuse_fail(i8 %x) {
 
 define <2 x i1> @fold_cmp_ne_ctpop_c(<2 x i8> %x) {
 ; CHECK-LABEL: @fold_cmp_ne_ctpop_c(
-; CHECK-NEXT:    [[NX:%.*]] = xor <2 x i8> [[X:%.*]], <i8 -1, i8 -1>
-; CHECK-NEXT:    [[CNT:%.*]] = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[NX]]), !range [[RNG0]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ne <2 x i8> [[CNT]], <i8 44, i8 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[X:%.*]]), !range [[RNG0]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne <2 x i8> [[TMP1]], <i8 -36, i8 5>
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %nx = xor <2 x i8> %x, <i8 -1, i8 -1>
@@ -163,11 +157,10 @@ define <2 x i1> @fold_cmp_ne_ctpop_var_fail(<2 x i8> %x, <2 x i8> %y) {
 
 define i1 @fold_cmp_ult_ctpop_c(i8 %x, i8 %y, i1 %cond) {
 ; CHECK-LABEL: @fold_cmp_ult_ctpop_c(
-; CHECK-NEXT:    [[NX:%.*]] = xor i8 [[X:%.*]], -1
-; CHECK-NEXT:    [[NY:%.*]] = add i8 [[Y:%.*]], 15
-; CHECK-NEXT:    [[N:%.*]] = select i1 [[COND:%.*]], i8 [[NX]], i8 [[NY]]
-; CHECK-NEXT:    [[CNT:%.*]] = call i8 @llvm.ctpop.i8(i8 [[N]]), !range [[RNG0]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i8 [[CNT]], 5
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i8 -16, [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND:%.*]], i8 [[X:%.*]], i8 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.ctpop.i8(i8 [[TMP2]]), !range [[RNG0]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[TMP3]], 3
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %nx = xor i8 %x, -1
@@ -180,11 +173,10 @@ define i1 @fold_cmp_ult_ctpop_c(i8 %x, i8 %y, i1 %cond) {
 
 define i1 @fold_cmp_sle_ctpop_c(i8 %x, i8 %y, i1 %cond) {
 ; CHECK-LABEL: @fold_cmp_sle_ctpop_c(
-; CHECK-NEXT:    [[NX:%.*]] = xor i8 [[X:%.*]], -1
-; CHECK-NEXT:    [[NY:%.*]] = add i8 [[Y:%.*]], 15
-; CHECK-NEXT:    [[N:%.*]] = select i1 [[COND:%.*]], i8 [[NX]], i8 [[NY]]
-; CHECK-NEXT:    [[CNT:%.*]] = call i8 @llvm.ctpop.i8(i8 [[N]]), !range [[RNG0]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i8 [[CNT]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i8 -16, [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND:%.*]], i8 [[X:%.*]], i8 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.ctpop.i8(i8 [[TMP2]]), !range [[RNG0]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[TMP3]], 4
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %nx = xor i8 %x, -1
@@ -210,9 +202,8 @@ define i1 @fold_cmp_ult_ctpop_c_no_not_inst_save_fail(i8 %x) {
 
 define <2 x i1> @fold_cmp_ugt_ctpop_c(<2 x i8> %x) {
 ; CHECK-LABEL: @fold_cmp_ugt_ctpop_c(
-; CHECK-NEXT:    [[NX:%.*]] = xor <2 x i8> [[X:%.*]], <i8 -1, i8 -1>
-; CHECK-NEXT:    [[CNT:%.*]] = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[NX]]), !range [[RNG0]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt <2 x i8> [[CNT]], <i8 8, i8 6>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[X:%.*]]), !range [[RNG0]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ult <2 x i8> [[TMP1]], <i8 0, i8 2>
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %nx = xor <2 x i8> %x, <i8 -1, i8 -1>

Original file line number	Diff line number	Diff line change
`@@ -1683,6 +1683,9 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {`
`1683`	`1683`	`}`
`1684`	`1684`	`}`
`1685`	`1685`
	`1686`	`+ if (Instruction *R = tryFoldInstWithCtpopWithNot(&I))`
	`1687`	`+ return R;`
	`1688`	`+`
`1686`	`1689`	`// TODO(jingyue): Consider willNotOverflowSignedAdd and`
`1687`	`1690`	`// willNotOverflowUnsignedAdd to reduce the number of invocations of`
`1688`	`1691`	`// computeKnownBits.`
`@@ -2445,6 +2448,9 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {`
`2445`	`2448`	`}`
`2446`	`2449`	`}`
`2447`	`2450`
	`2451`	`+ if (Instruction *R = tryFoldInstWithCtpopWithNot(&I))`
	`2452`	`+ return R;`
	`2453`	`+`
`2448`	`2454`	`if (Instruction *R = foldSubOfMinMax(I, Builder))`
`2449`	`2455`	`return R;`
`2450`	`2456`
Original file line number	Diff line number	Diff line change
`@@ -1323,6 +1323,9 @@ Instruction *InstCombinerImpl::foldICmpWithConstant(ICmpInst &Cmp) {`
`1323`	`1323`	`return replaceInstUsesWith(Cmp, NewPhi);`
`1324`	`1324`	`}`
`1325`	`1325`
	`1326`	`+ if (Instruction *R = tryFoldInstWithCtpopWithNot(&Cmp))`
	`1327`	`+ return R;`
	`1328`	`+`
`1326`	`1329`	`return nullptr;`
`1327`	`1330`	`}`
`1328`	`1331`