diff --git a/llvm/test/Transforms/InstCombine/fold-binop-of-reductions.ll b/llvm/test/Transforms/InstCombine/fold-binop-of-reductions.ll
new file mode 100644
index 0000000000000..86f17cdfb79b4
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fold-binop-of-reductions.ll
@@ -0,0 +1,215 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i32 @add_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @add_of_reduce_add(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
+  %res = add i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @sub_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @sub_of_reduce_add(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = sub <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
+  %res = sub i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @mul_of_reduce_mul(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @mul_of_reduce_mul(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = mul i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v1)
+  %res = mul i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @and_of_reduce_and(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @and_of_reduce_and(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = and i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %v1)
+  %res = and i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @or_of_reduce_or(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @or_of_reduce_or(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = or i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v1)
+  %res = or i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @xor_of_reduce_xor(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @xor_of_reduce_xor(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = xor i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %v1)
+  %res = xor i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @reduction_does_not_match_binop(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @reduction_does_not_match_binop(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v1)
+  %res = add i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @intrinsics_do_not_match(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @intrinsics_do_not_match(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v1)
+  %res = add i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @element_counts_do_not_match(<16 x i32> %v0, <8 x i32> %v1) {
+; CHECK-LABEL: define i32 @element_counts_do_not_match(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v1)
+  %res = add i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @multiple_use_of_reduction_0(<16 x i32> %v0, <16 x i32> %v1, ptr %p) {
+; CHECK-LABEL: define i32 @multiple_use_of_reduction_0(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    store i32 [[V0_RED]], ptr [[P]], align 4
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
+  %res = add i32 %v0_red, %v1_red
+  store i32 %v0_red, ptr %p
+  ret i32 %res
+}
+
+define i32 @multiple_use_of_reduction_1(<16 x i32> %v0, <16 x i32> %v1, ptr %p) {
+; CHECK-LABEL: define i32 @multiple_use_of_reduction_1(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    store i32 [[V1_RED]], ptr [[P]], align 4
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
+  %res = add i32 %v0_red, %v1_red
+  store i32 %v1_red, ptr %p
+  ret i32 %res
+}
+
+define i32 @do_not_preserve_overflow_flags(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @do_not_preserve_overflow_flags(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add nuw nsw i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
+  %res = add nsw nuw i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @preserve_disjoint_flags(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @preserve_disjoint_flags(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = or disjoint i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v1)
+  %res = or disjoint i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @add_of_reduce_add_vscale(<vscale x 16 x i32> %v0, <vscale x 16 x i32> %v1) {
+; CHECK-LABEL: define i32 @add_of_reduce_add_vscale(
+; CHECK-SAME: <vscale x 16 x i32> [[V0:%.*]], <vscale x 16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %v1)
+  %res = add i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @element_counts_do_not_match_vscale(<vscale x 16 x i32> %v0, <vscale x 8 x i32> %v1) {
+; CHECK-LABEL: define i32 @element_counts_do_not_match_vscale(
+; CHECK-SAME: <vscale x 16 x i32> [[V0:%.*]], <vscale x 8 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> %v1)
+  %res = add i32 %v0_red, %v1_red
+  ret i32 %res
+}