From 647bfbdfca95340a6a665fc5f49372a9612dd21d Mon Sep 17 00:00:00 2001
From: Jerry Dang
Date: Sun, 30 Nov 2025 15:04:04 -0500
Subject: [PATCH 1/4] [VectorCombine] Fold permute of intrinsics into
 intrinsic of permutes: shuffle(intrinsic, poison/undef) -> intrinsic(shuffle)

Implements #170002
---
 .../Transforms/Vectorize/VectorCombine.cpp    |  90 +++++++++++
 .../AArch64/shuffle-of-intrinsic-permute.ll   | 148 ++++++++++++++++++
 .../AArch64/shuffletoidentity.ll              |   6 +-
 3 files changed, 241 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/VectorCombine/AArch64/shuffle-of-intrinsic-permute.ll

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index f1890e4f5fb95..7540a954fb1c1 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -139,6 +139,7 @@ class VectorCombine {
   bool foldShuffleOfSelects(Instruction &I);
   bool foldShuffleOfCastops(Instruction &I);
   bool foldShuffleOfShuffles(Instruction &I);
+  bool foldPermuteOfIntrinsic(Instruction &I);
   bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
@@ -2960,6 +2961,93 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
   return true;
 }
 
+/// Try to convert
+/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
+bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
+  Value *V0, *V1;
+  ArrayRef<int> Mask;
+  if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_Value(V1), m_Mask(Mask))))
+    return false;
+
+  // Check for permute
+  if (!match(V1, m_Poison()) && !match(V1, m_Undef())) {
+    LLVM_DEBUG(dbgs() << "not a permute\n");
+    return false;
+  }
+
+  auto *II0 = dyn_cast<IntrinsicInst>(V0);
+  if (!II0)
+    return false;
+
+  auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
+  auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
+  if (!ShuffleDstTy || !IntrinsicSrcTy)
+    return false;
+
+  // Validate it's a pure permute; the mask must only reference the first vector
+  unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
+  for (int Idx : Mask) {
+    if (Idx > 0 && Idx >= (int)NumSrcElts)
+      return false;
+  }
+
+  Intrinsic::ID IID = II0->getIntrinsicID();
+  if (!isTriviallyVectorizable(IID))
+    return false;
+
+  // Cost analysis
+  InstructionCost OldCost =
+      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
+                         IntrinsicSrcTy, Mask, CostKind);
+
+  SmallVector<Type *> NewArgsTy;
+  InstructionCost NewCost = 0;
+
+  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
+    if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
+      NewArgsTy.push_back(II0->getArgOperand(I)->getType());
+    } else {
+      auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
+      auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
+                                         ShuffleDstTy->getNumElements());
+      NewArgsTy.push_back(ArgTy);
+      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                    ArgTy, VecTy, Mask, CostKind);
+    }
+  }
+  IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
+  NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
+
+  LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n  OldCost: "
+                    << OldCost << " vs NewCost: " << NewCost << "\n");
+
+  if (NewCost > OldCost)
+    return false;
+
+  // Transform
+  SmallVector<Value *> NewArgs;
+  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
+    if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
+      NewArgs.push_back(II0->getArgOperand(I));
+    } else {
+      Value *Shuf = Builder.CreateShuffleVector(
+          II0->getArgOperand(I),
+          PoisonValue::get(II0->getArgOperand(I)->getType()), Mask);
+      NewArgs.push_back(Shuf);
+      Worklist.pushValue(Shuf);
+    }
+  }
+
+  Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
+
+  if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
+    NewInst->copyIRFlags(II0);
+
+  replaceValue(I, *NewIntrinsic);
+  return true;
+}
+
 using InstLane = std::pair<Use *, int>;
 
 static InstLane lookThroughShuffles(Use *U, int Lane) {
@@ -4718,6 +4806,8 @@ bool VectorCombine::run() {
         return true;
       if (foldShuffleOfShuffles(I))
         return true;
+      if (foldPermuteOfIntrinsic(I))
+        return true;
      if (foldShuffleOfIntrinsics(I))
         return true;
       if (foldSelectShuffle(I))
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffle-of-intrinsic-permute.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffle-of-intrinsic-permute.ll
new file mode 100644
index 0000000000000..d6cac620bc28e
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffle-of-intrinsic-permute.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=vector-combine -S -mtriple=aarch64 %s | FileCheck %s
+
+; This file tests the foldPermuteOfIntrinsic optimization, which transforms:
+; shuffle(intrinsic(args), poison/undef) -> intrinsic(shuffle(args))
+; when the shuffle is a permute (it operates on a single vector) and the
+; cost model determines the transformation is beneficial.
+
+;; ============================================================================
+;; Positive Tests - Should Optimize
+;; ============================================================================
+
+define <4 x i32> @extract_lower_sadd_sat(<8 x i32> %v1, <8 x i32> %v2) {
+; CHECK-LABEL: @extract_lower_sadd_sat(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[V1:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[RESULT:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+; CHECK-NEXT:    ret <4 x i32> [[RESULT]]
+;
+  %sat = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %v1, <8 x i32> %v2)
+  %result = shufflevector <8 x i32> %sat, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @extract_lower_uadd_sat(<8 x i32> %v1, <8 x i32> %v2) {
+; CHECK-LABEL: @extract_lower_uadd_sat(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[V1:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[RESULT:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+; CHECK-NEXT:    ret <4 x i32> [[RESULT]]
+;
+  %sat = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %v1, <8 x i32> %v2)
+  %result = shufflevector <8 x i32> %sat, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <4 x float> @extract_lower_fma(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; CHECK-LABEL: @extract_lower_fma(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[C:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[RESULT:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]])
+; CHECK-NEXT:    ret <4 x float> [[RESULT]]
+;
+  %fma = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+  %result = shufflevector <8 x float> %fma, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %result
+}
+
+define <4 x i32> @extract_lower_abs_should_not_shuffle_scalar(<8 x i32> %v) {
+; CHECK-LABEL: @extract_lower_abs_should_not_shuffle_scalar(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[RESULT:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false)
+; CHECK-NEXT:    ret <4 x i32> [[RESULT]]
+;
+  %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %v, i1 false)
+  %result = shufflevector <8 x i32> %abs, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+define <2 x i64> @extract_lower_i64(<4 x i64> %v1, <4 x i64> %v2) {
+; CHECK-LABEL: @extract_lower_i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[V1:%.*]], <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[V2:%.*]], <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[RESULT:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+; CHECK-NEXT:    ret <2 x i64> [[RESULT]]
+;
+  %sat = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %v1, <4 x i64> %v2)
+  %result = shufflevector <4 x i64> %sat, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+  ret <2 x i64> %result
+}
+
+define <8 x i16> @extract_lower_i16(<16 x i16> %v1, <16 x i16> %v2) {
+; CHECK-LABEL: @extract_lower_i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[V1:%.*]], <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[V2:%.*]], <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[RESULT:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+; CHECK-NEXT:    ret <8 x i16> [[RESULT]]
+;
+  %sat = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %v1, <16 x i16> %v2)
+  %result = shufflevector <16 x i16> %sat, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %result
+}
+
+define <4 x i32> @extract_lower_with_undef(<8 x i32> %v1, <8 x i32> %v2) {
+; CHECK-LABEL: @extract_lower_with_undef(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[V1:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[RESULT:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+; CHECK-NEXT:    ret <4 x i32> [[RESULT]]
+;
+  %sat = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %v1, <8 x i32> %v2)
+  %result = shufflevector <8 x i32> %sat, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+;; ============================================================================
+;; Negative Tests - Should NOT Optimize
+;; ============================================================================
+
+define <4 x i32> @same_size_permute(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: @same_size_permute(
+; CHECK-NEXT:    [[SAT:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[V1:%.*]], <4 x i32> [[V2:%.*]])
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <4 x i32> [[SAT]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[RESULT]]
+;
+  %sat = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %v1, <4 x i32> %v2)
+  %result = shufflevector <4 x i32> %sat, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @not_a_permute_uses_second_operand(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %other) {
+; CHECK-LABEL: @not_a_permute_uses_second_operand(
+; CHECK-NEXT:    [[SAT:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[V1:%.*]], <4 x i32> [[V2:%.*]])
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <4 x i32> [[SAT]], <4 x i32> [[OTHER:%.*]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x i32> [[RESULT]]
+;
+  %sat = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %v1, <4 x i32> %v2)
+  %result = shufflevector <4 x i32> %sat, <4 x i32> %other, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @not_an_intrinsic(<8 x i32> %v1, <8 x i32> %v2) {
+; CHECK-LABEL: @not_an_intrinsic(
+; CHECK-NEXT:    [[ADD:%.*]] = add <8 x i32> [[V1:%.*]], [[V2:%.*]]
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[ADD]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[RESULT]]
+;
+  %add = add <8 x i32> %v1, %v2
+  %result = shufflevector <8 x i32> %add, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %result
+}
+
+declare <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64>, <4 x i64>)
+declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>)
+declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>)
+declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>)
+
+declare <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>)
+
+declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg)
+declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1 immarg)
+
+declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index ed29719d49493..7ffd0d29b4f05 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -204,9 +204,9 @@ define <8 x i8> @abs_different(<8 x i8> %a) {
 
 define <4 x i32> @poison_intrinsic(<2 x i16> %l256) {
 ; CHECK-LABEL: @poison_intrinsic(
-; CHECK-NEXT:    [[L266:%.*]] = call <2 x i16> @llvm.abs.v2i16(<2 x i16> [[L256:%.*]], i1 false)
-; CHECK-NEXT:    [[L267:%.*]] = shufflevector <2 x i16> [[L266]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[L271:%.*]] = zext <4 x i16> [[L267]] to <4 x i32>
+; CHECK-NEXT:    [[L267:%.*]] = shufflevector <2 x i16> [[L266:%.*]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i16> @llvm.abs.v4i16(<4 x i16> [[L267]], i1 false)
+; CHECK-NEXT:    [[L271:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[L271]]
 ;
   %l266 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %l256, i1 false)

From 6662e4549c162baa536d9f998b8befe49ec2937c Mon Sep 17 00:00:00 2001
From: Jerry Dang
Date: Wed, 3 Dec 2025 11:27:09 -0500
Subject: [PATCH 2/4] Include shuffle arguments to improve cost analysis;
 remove undef tests

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp |  7 ++++---
 .../AArch64/shuffle-of-intrinsic-permute.ll     | 14 +-------------
 2 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 7540a954fb1c1..3b5600c2d4885 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2999,11 +2999,11 @@ bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
   InstructionCost OldCost =
       TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
-                         IntrinsicSrcTy, Mask, CostKind);
+                         IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0, V1},
+                         &I);
 
   SmallVector<Type *> NewArgsTy;
   InstructionCost NewCost = 0;
-
   for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
     if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
       NewArgsTy.push_back(II0->getArgOperand(I)->getType());
@@ -3013,7 +3013,8 @@ bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
                                          ShuffleDstTy->getNumElements());
       NewArgsTy.push_back(ArgTy);
       NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                    ArgTy, VecTy, Mask, CostKind);
+                                    ArgTy, VecTy, Mask, CostKind, 0, nullptr,
+                                    {II0->getArgOperand(I)});
     }
   }
   IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffle-of-intrinsic-permute.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffle-of-intrinsic-permute.ll
index d6cac620bc28e..39a0357b56286 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffle-of-intrinsic-permute.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffle-of-intrinsic-permute.ll
@@ -2,7 +2,7 @@
 ; RUN: opt -passes=vector-combine -S -mtriple=aarch64 %s | FileCheck %s
 
 ; This file tests the foldPermuteOfIntrinsic optimization, which transforms:
-; shuffle(intrinsic(args), poison/undef) -> intrinsic(shuffle(args))
+; shuffle(intrinsic(args), poison) -> intrinsic(shuffle(args))
 ; when the shuffle is a permute (it operates on a single vector) and the
 ; cost model determines the transformation is beneficial.
 
@@ -82,18 +82,6 @@ define <8 x i16> @extract_lower_i16(<16 x i16> %v1, <16 x i16> %v2) {
   ret <8 x i16> %result
 }
 
-define <4 x i32> @extract_lower_with_undef(<8 x i32> %v1, <8 x i32> %v2) {
-; CHECK-LABEL: @extract_lower_with_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[V1:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[RESULT:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; CHECK-NEXT:    ret <4 x i32> [[RESULT]]
-;
-  %sat = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %v1, <8 x i32> %v2)
-  %result = shufflevector <8 x i32> %sat, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x i32> %result
-}
-
 ;; ============================================================================
 ;; Negative Tests - Should NOT Optimize
 ;; ============================================================================

From 94b313d10a660fa4b45119a9a63828c8818b24ac Mon Sep 17 00:00:00 2001
From: Jerry Dang
Date: Wed, 3 Dec 2025 18:47:53 -0500
Subject: [PATCH 3/4] regenerate test for shuffle-of-fma

---
 .../VectorCombine/X86/shuffle-of-fma-const.ll | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-fma-const.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-fma-const.ll
index b05f851a846f4..ff810b615bac9 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-fma-const.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-fma-const.ll
@@ -3,11 +3,17 @@
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX
 
 define <4 x float> @shuffle_fma_const_chain(<4 x float> %a0) {
-; CHECK-LABEL: define <4 x float> @shuffle_fma_const_chain(
-; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[F:%.*]] = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
-; CHECK-NEXT:    [[RES:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    ret <4 x float> [[RES]]
+; SSE-LABEL: define <4 x float> @shuffle_fma_const_chain(
+; SSE-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0:[0-9]+]] {
+; SSE-NEXT:    [[F:%.*]] = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
+; SSE-NEXT:    [[RES:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    ret <4 x float> [[RES]]
+;
+; AVX-LABEL: define <4 x float> @shuffle_fma_const_chain(
+; AVX-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
+; AVX-NEXT:    ret <4 x float> [[RES]]
 ;
   %f = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
   %res = shufflevector <4 x float> %f, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -16,7 +22,7 @@ define <4 x float> @shuffle_fma_const_chain(<4 x float> %a0) {
 
 define <8 x float> @concat_fma_const_chain(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: define <8 x float> @concat_fma_const_chain(
-; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> splat (float 0x3F8DE8D040000000), <8 x float> splat (float 0xBFB3715EE0000000))
 ; CHECK-NEXT:    ret <8 x float> [[RES]]

From df07ed44be98428e3153e1f7dc8bcc185879cd5a Mon Sep 17 00:00:00 2001
From: Jerry Dang
Date: Thu, 4 Dec 2025 12:11:08 -0500
Subject: [PATCH 4/4] Address refactor comments

---
 .../Transforms/Vectorize/VectorCombine.cpp | 23 +++++--------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 4f8e3687193e5..243f685cf25e2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2965,16 +2965,10 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
 /// Try to convert
 /// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
 bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
-  Value *V0, *V1;
+  Value *V0;
   ArrayRef<int> Mask;
-  if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_Value(V1), m_Mask(Mask))))
-    return false;
-
-  // Check for permute
-  if (!match(V1, m_Poison()) && !match(V1, m_Undef())) {
-    LLVM_DEBUG(dbgs() << "not a permute\n");
+  if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_Undef(), m_Mask(Mask))))
     return false;
-  }
 
   auto *II0 = dyn_cast<IntrinsicInst>(V0);
   if (!II0)
@@ -2987,10 +2981,8 @@ bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
 
   // Validate it's a pure permute; the mask must only reference the first vector
   unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
-  for (int Idx : Mask) {
-    if (Idx > 0 && Idx >= (int)NumSrcElts)
-      return false;
-  }
+  if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
+    return false;
 
   Intrinsic::ID IID = II0->getIntrinsicID();
   if (!isTriviallyVectorizable(IID))
@@ -3000,8 +2992,7 @@ bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
   InstructionCost OldCost =
       TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
-                         IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0, V1},
-                         &I);
+                         IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
 
   SmallVector<Type *> NewArgsTy;
   InstructionCost NewCost = 0;
@@ -3033,9 +3024,7 @@ bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
     if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
       NewArgs.push_back(II0->getArgOperand(I));
     } else {
-      Value *Shuf = Builder.CreateShuffleVector(
-          II0->getArgOperand(I),
-          PoisonValue::get(II0->getArgOperand(I)->getType()), Mask);
+      Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
       NewArgs.push_back(Shuf);
       Worklist.pushValue(Shuf);
     }