From cf8120e4b585ef47831efb065361844afb1eac17 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 10 Jan 2025 16:18:18 +0700 Subject: [PATCH 1/6] DAG: Fix vector bin op scalarize defining a partially undef vector This avoids some of the pending regressions after AMDGPU implements isExtractVecEltCheap. In a case like shl , splat k, because the second operand was fully defined, we would fall through and use the splat value for the first operand, losing the undef high bits. This would result in an additional instruction to handle the high bits. Add some reduced testcases for different opcodes for one of the regressions. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 +- llvm/test/CodeGen/AMDGPU/trunc-combine.ll | 331 ++++++++++++++++++ 2 files changed, 337 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6805e0cb23ace..4f3725accc10c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -27534,8 +27534,12 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG, // If all lanes but 1 are undefined, no need to splat the scalar result. // TODO: Keep track of undefs and use that info in the general case. if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() && - count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 && - count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) { + // This is assuming if either input is undef, the result will fold out. + // + // TODO: Do we need to check if the opcode/operand propagates undef? + // Should we ignore operation identity values? + ((count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) || + (count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1))) { // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) --> // build_vec ..undef, (bo X, Y), undef... SmallVector Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT)); diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index aa3e05fdbdb36..02e30b6c68e99 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -156,3 +156,334 @@ define <2 x i16> @trunc_v2i64_arg_to_v2i16(<2 x i64> %arg0) #0 { %trunc = trunc <2 x i64> %arg0 to <2 x i16> ret <2 x i16> %trunc } + +; Test for regression where an unnecessary v_alignbit_b32 was inserted +; on the final result, due to losing the fact that the upper half of +; the lhs vector was undef. +define <2 x i16> @vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = lshr <2 x i32> %undef.hi.elt, splat (i32 16) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshr_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e64 v0, v0, 16 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = lshr <2 x i32> splat (i32 16), %undef.hi.elt + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %ashr = ashr <2 x i32> %undef.hi.elt, splat (i32 16) + %trunc = trunc <2 x i32> %ashr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_ashr_i32_e32 v0, -4, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_ashrrev_i32_e64 v0, v0, -4 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = ashr <2 x i32> splat (i32 -4), %undef.hi.elt + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_add_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_add_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_add_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = add <2 x i32> %undef.hi.elt, splat (i32 16) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_shl_rhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_shl_rhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshl_b32_e32 v0, 2, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xfffe, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_shl_rhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 2 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = shl <2 x i32> splat (i32 2), %undef.hi.elt + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_sub_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_sub_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_sub_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = sub <2 x i32> %undef.hi.elt, splat (i32 16) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_or_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0xffff +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = or <2 x i32> %undef.hi.elt, splat (i32 17) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_xor_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_xor_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_xor_b32_e32 v0, 17, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_xor_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_xor_b32_e32 v0, 17, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = xor <2 x i32> %undef.hi.elt, splat (i32 17) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_shl_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_shl_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xfffc, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_shl_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v0, 2, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %shl = shl <2 x i32> %undef.hi.elt, splat (i32 2) + %trunc = trunc <2 x i32> %shl to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_mul_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_mul_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mul_lo_u32 v0, v0, 18 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_and_b32_e32 v0, 0xfffe, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_mul_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_lo_u32 v0, v0, 18 +; VI-NEXT: v_and_b32_e32 v0, 0xfffe, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = mul <2 x i32> %undef.hi.elt, splat (i32 18) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x38e38e39 +; SI-NEXT: v_mul_hi_i32 v0, v0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 31, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x38e38e39 +; VI-NEXT: v_mul_hi_i32 v0, v0, s4 +; VI-NEXT: v_lshrrev_b32_e32 v1, 31, v0 +; VI-NEXT: v_ashrrev_i32_e32 v0, 2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = sdiv <2 x i32> %undef.hi.elt, splat (i32 18) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_srem_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_srem_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x38e38e39 +; SI-NEXT: v_mul_hi_i32 v1, v0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 2, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; SI-NEXT: v_mul_lo_u32 v1, v1, 18 +; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_srem_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x38e38e39 +; VI-NEXT: v_mul_hi_i32 v1, v0, s4 +; VI-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; VI-NEXT: v_ashrrev_i32_e32 v1, 2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; VI-NEXT: v_mul_lo_u32 v1, v1, 18 +; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = srem <2 x i32> %undef.hi.elt, splat (i32 18) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + + +define <2 x i16> @vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x38e38e39 +; SI-NEXT: v_mul_hi_u32 v0, v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_bfe_u32 v0, v0, 2, 16 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x38e38e39 +; VI-NEXT: v_mul_hi_u32 v0, v0, s4 +; VI-NEXT: v_lshrrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = udiv <2 x i32> %undef.hi.elt, splat (i32 18) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} + +define <2 x i16> @vector_trunc_high_bits_undef_urem_lhs_alignbit_regression(i32 %arg0) { +; SI-LABEL: vector_trunc_high_bits_undef_urem_lhs_alignbit_regression: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x38e38e39 +; SI-NEXT: v_mul_hi_u32 v1, v0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 2, v1 +; SI-NEXT: v_mul_lo_u32 v1, v1, 18 +; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: vector_trunc_high_bits_undef_urem_lhs_alignbit_regression: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x38e38e39 +; VI-NEXT: v_mul_hi_u32 v1, v0, s4 +; VI-NEXT: v_lshrrev_b32_e32 v1, 2, v1 +; VI-NEXT: v_mul_lo_u32 v1, v1, 18 +; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] + %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 + %lshr = urem <2 x i32> %undef.hi.elt, splat (i32 18) + %trunc = trunc <2 x i32> %lshr to <2 x i16> + ret <2 x i16> %trunc +} From a4a1f28bd58198a87ea192a870f81f70059829d7 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 14 Jan 2025 21:21:46 +0700 Subject: [PATCH 2/6] Just use insert_vector_elt --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 4f3725accc10c..bc9ec53cb4e27 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -27533,18 +27533,11 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG, // If all lanes but 1 are undefined, no need to splat the scalar result. // TODO: Keep track of undefs and use that info in the general case. - if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() && - // This is assuming if either input is undef, the result will fold out. - // - // TODO: Do we need to check if the opcode/operand propagates undef? - // Should we ignore operation identity values? - ((count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) || - (count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1))) { + if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) { // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) --> - // build_vec ..undef, (bo X, Y), undef... - SmallVector Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT)); - Ops[Index0] = ScalarBO; - return DAG.getBuildVector(VT, DL, Ops); + // insert_vector_elt undef, (bo X, Y), index + return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, DAG.getUNDEF(VT), + ScalarBO, IndexC); } // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index From 351dfdfb6c12e137f94a900afa111049accdc6b2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 15 Jan 2025 10:44:34 +0700 Subject: [PATCH 3/6] Check if getNode folded to undef instead --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index bc9ec53cb4e27..04dce63155025 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -27536,8 +27536,24 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG, if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) { // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) --> // insert_vector_elt undef, (bo X, Y), index - return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, DAG.getUNDEF(VT), - ScalarBO, IndexC); + + SmallVector EltsX, EltsY; + DAG.ExtractVectorElements(Src0, EltsX); + DAG.ExtractVectorElements(Src1, EltsY); + + SmallVector EltsResult; + + unsigned NonUndefElements = 0; + for (auto [X, Y] : zip(EltsX, EltsY)) { + SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags()); + if (!ScalarBO.isUndef()) + if (NonUndefElements++ > 1) + break; + EltsResult.push_back(ScalarBO); + } + + if (NonUndefElements == 1) + return DAG.getBuildVector(VT, DL, EltsResult); } // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index From 70d7ce12230e70813a5eee17506609b7eabb29f8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 15 Jan 2025 10:47:41 +0700 Subject: [PATCH 4/6] Don't bother counting the undefs It folds to undef or a constant and seems to have the best result anyway rather than overdefining by splatting the variable value. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 31 ++++++------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 04dce63155025..19b3355681df7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -27526,36 +27526,25 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG, if ((Opcode == ISD::MULHS || Opcode == ISD::MULHU) && !TLI.isTypeLegal(EltVT)) return SDValue(); - SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL); - SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC); - SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC); - SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags()); - // If all lanes but 1 are undefined, no need to splat the scalar result. // TODO: Keep track of undefs and use that info in the general case. if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) { // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) --> - // insert_vector_elt undef, (bo X, Y), index - - SmallVector EltsX, EltsY; + // build_vec ..undef, (bo X, Y), undef... + SmallVector EltsX, EltsY, EltsResult; DAG.ExtractVectorElements(Src0, EltsX); DAG.ExtractVectorElements(Src1, EltsY); - SmallVector EltsResult; - - unsigned NonUndefElements = 0; - for (auto [X, Y] : zip(EltsX, EltsY)) { - SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags()); - if (!ScalarBO.isUndef()) - if (NonUndefElements++ > 1) - break; - EltsResult.push_back(ScalarBO); - } - - if (NonUndefElements == 1) - return DAG.getBuildVector(VT, DL, EltsResult); + for (auto [X, Y] : zip(EltsX, EltsY)) + EltsResult.push_back(DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags())); + return DAG.getBuildVector(VT, DL, EltsResult); } + SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL); + SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC); + SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC); + SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags()); + // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index return DAG.getSplat(VT, DL, ScalarBO); } From da27407f430cc6ef990880798a1a4971c2863ba4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 16 Jan 2025 18:42:15 +0700 Subject: [PATCH 5/6] Update comment --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 19b3355681df7..f4c04756e1305 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -27526,8 +27526,8 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG, if ((Opcode == ISD::MULHS || Opcode == ISD::MULHU) && !TLI.isTypeLegal(EltVT)) return SDValue(); - // If all lanes but 1 are undefined, no need to splat the scalar result. - // TODO: Keep track of undefs and use that info in the general case. + // If all result lanes but 1 are undefined, no need to splat the scalar + // result. if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) { // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) --> // build_vec ..undef, (bo X, Y), undef... From c41456e1c0e8ded167f197b62e148912c657eb23 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 16 Jan 2025 19:54:29 +0700 Subject: [PATCH 6/6] Fix comment again --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f4c04756e1305..58ab99e0dcdee 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -27526,9 +27526,11 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG, if ((Opcode == ISD::MULHS || Opcode == ISD::MULHU) && !TLI.isTypeLegal(EltVT)) return SDValue(); - // If all result lanes but 1 are undefined, no need to splat the scalar - // result. if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode()) { + // All but one element should have an undef input, which will fold to a + // constant or undef. Avoid splatting which would over-define potentially + // undefined elements. + // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) --> // build_vec ..undef, (bo X, Y), undef... SmallVector EltsX, EltsY, EltsResult;