diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index a9ac5ff9b9c89..5dae5f2621a3e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1787,6 +1787,33 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
   if (Instruction *Ashr = foldAddToAshr(I))
     return Ashr;
 
+  // Ceiling division by power-of-2:
+  // (X >> log2(N)) + zext(X & (N-1) != 0) --> (X + (N-1)) >> log2(N)
+  // This is valid when adding (N-1) to X doesn't overflow.
+  {
+    Value *X;
+    const APInt *ShiftAmt, *Mask;
+
+    // Match: (X >> C) + zext((X & Mask) != 0)
+    //    or: zext((X & Mask) != 0) + (X >> C)
+    if (match(&I, m_c_Add(m_OneUse(m_LShr(m_Value(X), m_APInt(ShiftAmt))),
+                          m_ZExt(m_SpecificICmp(
+                              ICmpInst::ICMP_NE,
+                              m_And(m_Deferred(X), m_LowBitMask(Mask)),
+                              m_ZeroInt())))) &&
+        Mask->popcount() == *ShiftAmt) {
+
+      // Check if X + Mask doesn't overflow
+      Constant *MaskC = ConstantInt::get(X->getType(), *Mask);
+      if (willNotOverflowUnsignedAdd(X, MaskC, I)) {
+        // (X + Mask) >> ShiftAmt
+        Value *Add = Builder.CreateNUWAdd(X, MaskC);
+        return BinaryOperator::CreateLShr(
+            Add, ConstantInt::get(X->getType(), *ShiftAmt));
+      }
+    }
+  }
+
   // (~X) + (~Y) --> -2 - (X + Y)
   {
     // To ensure we can save instructions we need to ensure that we consume both
diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll
index 495f99824652d..a16e30bb49452 100644
--- a/llvm/test/Transforms/InstCombine/add.ll
+++ b/llvm/test/Transforms/InstCombine/add.ll
@@ -4273,4 +4273,265 @@ define i32 @fold_zext_nneg_add_const_fail2(i8 %x) {
 }
 
 declare void @llvm.assume(i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+; Ceiling division by power-of-2: (x >> log2(N)) + ((x & (N-1)) != 0) -> (x + (N-1)) >> log2(N)
+; This is only valid when x + (N-1) doesn't overflow
+
+; Test with known range that prevents overflow
+define i32 @ceil_div_by_8_known_range(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_by_8_known_range(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Test with the exact IR from the original testcase
+define i32 @ceil_div_from_clz(i32 %v) {
+; CHECK-LABEL: @ceil_div_from_clz(
+; CHECK-NEXT:    [[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nuw nsw i32 39, [[CTLZ]]
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ctlz = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %v, i1 false)
+  %sub = sub nuw nsw i32 32, %ctlz
+  %shr = lshr i32 %sub, 3
+  %and = and i32 %sub, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add nuw nsw i32 %shr, %ext
+  ret i32 %r
+}
+
+; Vector version with known range
+define <2 x i32> @ceil_div_by_8_vec_range(<2 x i32> range(i32 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_by_8_vec_range(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw <2 x i32> [[X:%.*]], splat (i32 7)
+; CHECK-NEXT:    [[R:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 3)
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %shr = lshr <2 x i32> %x, <i32 3, i32 3>
+  %and = and <2 x i32> %x, <i32 7, i32 7>
+  %cmp = icmp ne <2 x i32> %and, <i32 0, i32 0>
+  %ext = zext <2 x i1> %cmp to <2 x i32>
+  %r = add <2 x i32> %shr, %ext
+  ret <2 x i32> %r
+}
+
+; Ceiling division by 16 with known range
+define i16 @ceil_div_by_16_i16(i16 range(i16 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_by_16_i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i16 [[X:%.*]], 15
+; CHECK-NEXT:    [[R:%.*]] = lshr i16 [[TMP1]], 4
+; CHECK-NEXT:    ret i16 [[R]]
+;
+  %shr = lshr i16 %x, 4
+  %and = and i16 %x, 15
+  %cmp = icmp ne i16 %and, 0
+  %ext = zext i1 %cmp to i16
+  %r = add i16 %shr, %ext
+  ret i16 %r
+}
+
+; Negative test: no overflow guarantee - should NOT optimize
+define i32 @ceil_div_by_8_no_overflow_info(i32 %x) {
+; CHECK-LABEL: @ceil_div_by_8_no_overflow_info(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Negative test: nuw on final add doesn't help
+define i32 @ceil_div_by_8_only_nuw_on_add(i32 %x) {
+; CHECK-LABEL: @ceil_div_by_8_only_nuw_on_add(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add nuw i32 %shr, %ext ; nuw here doesn't prove x+7 won't overflow
+  ret i32 %r
+}
+
+; Negative test: wrong mask
+define i32 @ceil_div_wrong_mask(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_mask(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 6
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 6 ; Wrong mask: should be 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Negative test: wrong shift amount
+define i32 @ceil_div_wrong_shift(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_shift(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 4
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 4 ; Shift by 4, but mask is 7 (should be 15)
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Negative test: wrong comparison
+define i32 @ceil_div_wrong_cmp(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_cmp(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp eq i32 %and, 0 ; Wrong: should be ne
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Multi-use test: all intermediate values have uses
+define i32 @ceil_div_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_multi_use(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    call void @use_i32(i32 [[SHR]])
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    call void @use_i32(i32 [[AND]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  call void @use_i32(i32 %shr)
+  %and = and i32 %x, 7
+  call void @use_i32(i32 %and)
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  call void @use_i32(i32 %ext)
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Commuted test: add operands are swapped
+define i32 @ceil_div_commuted(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %ext, %shr ; Operands swapped
+  ret i32 %r
+}
+
+; Commuted with multi-use
+define i32 @ceil_div_commuted_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_commuted_multi_use(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    call void @use_i32(i32 [[SHR]])
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  call void @use_i32(i32 %shr)
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  call void @use_i32(i32 %ext)
+  %r = add i32 %ext, %shr ; Operands swapped
+  ret i32 %r
+}
+
+; Multi-use test where only zext has multiple uses - should still optimize
+define i32 @ceil_div_zext_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_zext_multi_use(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[X]], 7
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  call void @use_i32(i32 %ext)
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Multi-use with vector type
+define <2 x i32> @ceil_div_vec_multi_use(<2 x i32> range(i32 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_vec_multi_use(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i32> [[X:%.*]], splat (i32 3)
+; CHECK-NEXT:    call void @use_vec(<2 x i32> [[SHR]])
+; CHECK-NEXT:    [[AND:%.*]] = and <2 x i32> [[X]], splat (i32 7)
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
+; CHECK-NEXT:    [[EXT:%.*]] = zext <2 x i1> [[CMP]] to <2 x i32>
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw <2 x i32> [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %shr = lshr <2 x i32> %x, <i32 3, i32 3>
+  call void @use_vec(<2 x i32> %shr)
+  %and = and <2 x i32> %x, <i32 7, i32 7>
+  %cmp = icmp ne <2 x i32> %and, <i32 0, i32 0>
+  %ext = zext <2 x i1> %cmp to <2 x i32>
+  %r = add <2 x i32> %shr, %ext
+  ret <2 x i32> %r
+}
+
+declare void @use_i32(i32)
+declare void @use_vec(<2 x i32>)
 declare void @fake_func(i32)
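
As a standalone illustration of the fold (not part of the patch; the helper names below are invented for the example), the following C++ check exercises the identity (x >> 3) + ((x & 7) != 0) == (x + 7) >> 3 on a range where x + 7 cannot wrap, and prints the counterexample near UINT32_MAX that the willNotOverflowUnsignedAdd guard rules out:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Pattern before the fold: (x >> 3) + zext((x & 7) != 0).
static uint32_t ceil_div8_unfolded(uint32_t x) {
  return (x >> 3) + static_cast<uint32_t>((x & 7u) != 0);
}

// Pattern after the fold: (x + 7) >> 3.
static uint32_t ceil_div8_folded(uint32_t x) {
  return (x + 7u) >> 3;
}

int main() {
  // The two forms agree whenever x + 7 does not wrap, which is what the
  // range(i32 0, 100) / range(i32 0, 1000) attributes in the tests establish.
  for (uint32_t x = 0; x < 1000; ++x)
    assert(ceil_div8_unfolded(x) == ceil_div8_folded(x));

  // Near UINT32_MAX, x + 7 wraps and the folded form gives a different
  // result, so the transform must be gated on a no-overflow proof.
  uint32_t x = UINT32_MAX - 3;
  std::printf("x=%u unfolded=%u folded=%u\n", static_cast<unsigned>(x),
              static_cast<unsigned>(ceil_div8_unfolded(x)),
              static_cast<unsigned>(ceil_div8_folded(x)));
  return 0;
}

The CHECK lines in add.ll appear to follow the llvm/utils/update_test_checks.py format, so they can be regenerated with that script after editing the tests.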