Skip to content

Commit 23f09fd

Browse files
[VectorCombine] Fold permute of intrinsics into intrinsic of permutes: shuffle(intrinsic, poison/undef) -> intrinsic(shuffle) (#170052)
[VectorCombine] Fold permute of intrinsics into intrinsic of permutes

Add foldPermuteOfIntrinsic to transform:

    shuffle(intrinsic(args), poison) -> intrinsic(shuffle(args))

when the shuffle is a permute (operates on a single vector) and the cost model determines the transformation is profitable.

This optimization is particularly beneficial for subvector extractions, where we can avoid computing unused elements. For example:

    %fma = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, %b, %c)
    %result = shufflevector <8 x float> %fma, poison, <4 x i32> <0,1,2,3>

transforms to:

    %a_low = shufflevector <8 x float> %a, poison, <4 x i32> <0,1,2,3>
    %b_low = shufflevector <8 x float> %b, poison, <4 x i32> <0,1,2,3>
    %c_low = shufflevector <8 x float> %c, poison, <4 x i32> <0,1,2,3>
    %result = call <4 x float> @llvm.fma.v4f32(%a_low, %b_low, %c_low)

The transformation creates one shuffle per vector argument and calls the intrinsic with smaller vector types, reducing computation when only a subset of elements is needed.

The existing foldShuffleOfIntrinsics handled the blend case (two intrinsic inputs); this adds support for the permute case (single intrinsic input).

Fixes #170002
1 parent 716fffe commit 23f09fd

File tree

4 files changed

+231
-9
lines changed

4 files changed

+231
-9
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ class VectorCombine {
139139
bool foldShuffleOfSelects(Instruction &I);
140140
bool foldShuffleOfCastops(Instruction &I);
141141
bool foldShuffleOfShuffles(Instruction &I);
142+
bool foldPermuteOfIntrinsic(Instruction &I);
142143
bool foldShuffleOfIntrinsics(Instruction &I);
143144
bool foldShuffleToIdentity(Instruction &I);
144145
bool foldShuffleFromReductions(Instruction &I);
@@ -2961,6 +2962,83 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
29612962
return true;
29622963
}
29632964

2965+
/// Try to convert
2966+
/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
2967+
bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
2968+
Value *V0;
2969+
ArrayRef<int> Mask;
2970+
if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_Undef(), m_Mask(Mask))))
2971+
return false;
2972+
2973+
auto *II0 = dyn_cast<IntrinsicInst>(V0);
2974+
if (!II0)
2975+
return false;
2976+
2977+
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2978+
auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
2979+
if (!ShuffleDstTy || !IntrinsicSrcTy)
2980+
return false;
2981+
2982+
// Validate it's a pure permute, mask should only reference the first vector
2983+
unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
2984+
if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2985+
return false;
2986+
2987+
Intrinsic::ID IID = II0->getIntrinsicID();
2988+
if (!isTriviallyVectorizable(IID))
2989+
return false;
2990+
2991+
// Cost analysis
2992+
InstructionCost OldCost =
2993+
TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
2994+
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
2995+
IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
2996+
2997+
SmallVector<Type *> NewArgsTy;
2998+
InstructionCost NewCost = 0;
2999+
for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3000+
if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3001+
NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3002+
} else {
3003+
auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3004+
auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3005+
ShuffleDstTy->getNumElements());
3006+
NewArgsTy.push_back(ArgTy);
3007+
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3008+
ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3009+
{II0->getArgOperand(I)});
3010+
}
3011+
}
3012+
IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3013+
NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3014+
3015+
LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3016+
<< OldCost << " vs NewCost: " << NewCost << "\n");
3017+
3018+
if (NewCost > OldCost)
3019+
return false;
3020+
3021+
// Transform
3022+
SmallVector<Value *> NewArgs;
3023+
for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3024+
if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3025+
NewArgs.push_back(II0->getArgOperand(I));
3026+
} else {
3027+
Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3028+
NewArgs.push_back(Shuf);
3029+
Worklist.pushValue(Shuf);
3030+
}
3031+
}
3032+
3033+
Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3034+
3035+
if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3036+
NewInst->copyIRFlags(II0);
3037+
3038+
replaceValue(I, *NewIntrinsic);
3039+
return true;
3040+
}
3041+
29643042
using InstLane = std::pair<Use *, int>;
29653043

29663044
static InstLane lookThroughShuffles(Use *U, int Lane) {
@@ -4719,6 +4797,8 @@ bool VectorCombine::run() {
47194797
return true;
47204798
if (foldShuffleOfShuffles(I))
47214799
return true;
4800+
if (foldPermuteOfIntrinsic(I))
4801+
return true;
47224802
if (foldShuffleOfIntrinsics(I))
47234803
return true;
47244804
if (foldSelectShuffle(I))
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -passes=vector-combine -S -mtriple=aarch64 %s | FileCheck %s
3+
4+
; This file tests the foldPermuteOfIntrinsic optimization which transforms:
5+
; shuffle(intrinsic(args), poison) -> intrinsic(shuffle(args))
6+
; when the shuffle is a permute (operates on single vector) and cost model
7+
; determines the transformation is beneficial.
8+
9+
;; ============================================================================
10+
;; Positive Tests - Should Optimize
11+
;; ============================================================================
12+
13+
; Low-half extract (lanes 0-3) of an <8 x i32> llvm.sadd.sat result. The CHECK
; lines expect both operands to be shuffled to <4 x i32> and the intrinsic to
; be re-emitted as llvm.sadd.sat.v4i32.
define <4 x i32> @extract_lower_sadd_sat(<8 x i32> %v1, <8 x i32> %v2) {
14+
; CHECK-LABEL: @extract_lower_sadd_sat(
15+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[V1:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
16+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
17+
; CHECK-NEXT: [[RESULT:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
18+
; CHECK-NEXT: ret <4 x i32> [[RESULT]]
19+
;
20+
%sat = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %v1, <8 x i32> %v2)
21+
%result = shufflevector <8 x i32> %sat, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
22+
ret <4 x i32> %result
23+
}
24+
25+
; Same shape as the sadd.sat case, using llvm.uadd.sat: the CHECK lines expect
; two <4 x i32> operand shuffles feeding llvm.uadd.sat.v4i32.
define <4 x i32> @extract_lower_uadd_sat(<8 x i32> %v1, <8 x i32> %v2) {
26+
; CHECK-LABEL: @extract_lower_uadd_sat(
27+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[V1:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
28+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
29+
; CHECK-NEXT: [[RESULT:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
30+
; CHECK-NEXT: ret <4 x i32> [[RESULT]]
31+
;
32+
%sat = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %v1, <8 x i32> %v2)
33+
%result = shufflevector <8 x i32> %sat, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
34+
ret <4 x i32> %result
35+
}
36+
37+
; Three-operand floating-point case: the low four lanes of an <8 x float>
; llvm.fma are extracted. The CHECK lines expect one shuffle per operand and a
; single llvm.fma.v4f32 call.
define <4 x float> @extract_lower_fma(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
38+
; CHECK-LABEL: @extract_lower_fma(
39+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
40+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
41+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[C:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
42+
; CHECK-NEXT: [[RESULT:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]])
43+
; CHECK-NEXT: ret <4 x float> [[RESULT]]
44+
;
45+
%fma = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c)
46+
%result = shufflevector <8 x float> %fma, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
47+
ret <4 x float> %result
48+
}
49+
50+
; llvm.abs takes a scalar i1 flag as its second operand. The CHECK lines expect
; only the vector operand to be shuffled, with `i1 false` passed through to
; llvm.abs.v4i32 unchanged.
define <4 x i32> @extract_lower_abs_should_not_shuffle_scalar(<8 x i32> %v) {
51+
; CHECK-LABEL: @extract_lower_abs_should_not_shuffle_scalar(
52+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
53+
; CHECK-NEXT: [[RESULT:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false)
54+
; CHECK-NEXT: ret <4 x i32> [[RESULT]]
55+
;
56+
%abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %v, i1 false)
57+
%result = shufflevector <8 x i32> %abs, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
58+
ret <4 x i32> %result
59+
}
60+
61+
; i64 element variant: lanes 0-1 of a <4 x i64> llvm.sadd.sat are extracted.
; The CHECK lines expect two <2 x i64> operand shuffles feeding
; llvm.sadd.sat.v2i64.
define <2 x i64> @extract_lower_i64(<4 x i64> %v1, <4 x i64> %v2) {
62+
; CHECK-LABEL: @extract_lower_i64(
63+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[V1:%.*]], <4 x i64> poison, <2 x i32> <i32 0, i32 1>
64+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[V2:%.*]], <4 x i64> poison, <2 x i32> <i32 0, i32 1>
65+
; CHECK-NEXT: [[RESULT:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
66+
; CHECK-NEXT: ret <2 x i64> [[RESULT]]
67+
;
68+
%sat = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %v1, <4 x i64> %v2)
69+
%result = shufflevector <4 x i64> %sat, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
70+
ret <2 x i64> %result
71+
}
72+
73+
; i16 element variant: lanes 0-7 of a <16 x i16> llvm.sadd.sat are extracted.
; The CHECK lines expect two <8 x i16> operand shuffles feeding
; llvm.sadd.sat.v8i16.
define <8 x i16> @extract_lower_i16(<16 x i16> %v1, <16 x i16> %v2) {
74+
; CHECK-LABEL: @extract_lower_i16(
75+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[V1:%.*]], <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
76+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[V2:%.*]], <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
77+
; CHECK-NEXT: [[RESULT:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
78+
; CHECK-NEXT: ret <8 x i16> [[RESULT]]
79+
;
80+
%sat = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %v1, <16 x i16> %v2)
81+
%result = shufflevector <16 x i16> %sat, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
82+
ret <8 x i16> %result
83+
}
84+
85+
;; ============================================================================
86+
;; Negative Tests - Should NOT Optimize
87+
;; ============================================================================
88+
89+
; Negative test: the permute keeps the <4 x i32> width, so folding would not
; narrow the intrinsic. The CHECK lines expect the call and shuffle unchanged.
; NOTE(review): presumably rejected on cost under this aarch64 RUN line, since
; one result shuffle would become two operand shuffles - confirm.
define <4 x i32> @same_size_permute(<4 x i32> %v1, <4 x i32> %v2) {
90+
; CHECK-LABEL: @same_size_permute(
91+
; CHECK-NEXT: [[SAT:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[V1:%.*]], <4 x i32> [[V2:%.*]])
92+
; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <4 x i32> [[SAT]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
93+
; CHECK-NEXT: ret <4 x i32> [[RESULT]]
94+
;
95+
%sat = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %v1, <4 x i32> %v2)
96+
%result = shufflevector <4 x i32> %sat, <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
97+
ret <4 x i32> %result
98+
}
99+
100+
; Negative test: the second shuffle operand is a real value (%other) and the
; mask selects lanes 4 and 5 from it, so this is a two-source blend rather than
; a single-source permute. The CHECK lines expect the IR to stay unchanged.
define <4 x i32> @not_a_permute_uses_second_operand(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %other) {
101+
; CHECK-LABEL: @not_a_permute_uses_second_operand(
102+
; CHECK-NEXT: [[SAT:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[V1:%.*]], <4 x i32> [[V2:%.*]])
103+
; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <4 x i32> [[SAT]], <4 x i32> [[OTHER:%.*]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
104+
; CHECK-NEXT: ret <4 x i32> [[RESULT]]
105+
;
106+
%sat = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %v1, <4 x i32> %v2)
107+
%result = shufflevector <4 x i32> %sat, <4 x i32> %other, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
108+
ret <4 x i32> %result
109+
}
110+
111+
; Negative test: the shuffled value comes from a plain `add` instruction, not
; an intrinsic call. The CHECK lines expect the add and shuffle unchanged.
define <4 x i32> @not_an_intrinsic(<8 x i32> %v1, <8 x i32> %v2) {
112+
; CHECK-LABEL: @not_an_intrinsic(
113+
; CHECK-NEXT: [[ADD:%.*]] = add <8 x i32> [[V1:%.*]], [[V2:%.*]]
114+
; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[ADD]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
115+
; CHECK-NEXT: ret <4 x i32> [[RESULT]]
116+
;
117+
%add = add <8 x i32> %v1, %v2
118+
%result = shufflevector <8 x i32> %add, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
119+
ret <4 x i32> %result
120+
}
121+
122+
declare <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32>, <8 x i32>)
123+
declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>)
124+
declare <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64>, <4 x i64>)
125+
declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>)
126+
declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>)
127+
declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>)
128+
129+
declare <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32>, <8 x i32>)
130+
declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>)
131+
132+
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg)
133+
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1 immarg)
134+
135+
declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
136+
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)

llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -204,9 +204,9 @@ define <8 x i8> @abs_different(<8 x i8> %a) {
204204

205205
define <4 x i32> @poison_intrinsic(<2 x i16> %l256) {
206206
; CHECK-LABEL: @poison_intrinsic(
207-
; CHECK-NEXT: [[L266:%.*]] = call <2 x i16> @llvm.abs.v2i16(<2 x i16> [[L256:%.*]], i1 false)
208-
; CHECK-NEXT: [[L267:%.*]] = shufflevector <2 x i16> [[L266]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
209-
; CHECK-NEXT: [[L271:%.*]] = zext <4 x i16> [[L267]] to <4 x i32>
207+
; CHECK-NEXT: [[L267:%.*]] = shufflevector <2 x i16> [[L266:%.*]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
208+
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i16> @llvm.abs.v4i16(<4 x i16> [[L267]], i1 false)
209+
; CHECK-NEXT: [[L271:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
210210
; CHECK-NEXT: ret <4 x i32> [[L271]]
211211
;
212212
%l266 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %l256, i1 false)

llvm/test/Transforms/VectorCombine/X86/shuffle-of-fma-const.ll

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,17 @@
33
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX
44

55
define <4 x float> @shuffle_fma_const_chain(<4 x float> %a0) {
6-
; CHECK-LABEL: define <4 x float> @shuffle_fma_const_chain(
7-
; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0:[0-9]+]] {
8-
; CHECK-NEXT: [[F:%.*]] = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
9-
; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
10-
; CHECK-NEXT: ret <4 x float> [[RES]]
6+
; SSE-LABEL: define <4 x float> @shuffle_fma_const_chain(
7+
; SSE-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0:[0-9]+]] {
8+
; SSE-NEXT: [[F:%.*]] = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
9+
; SSE-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
10+
; SSE-NEXT: ret <4 x float> [[RES]]
11+
;
12+
; AVX-LABEL: define <4 x float> @shuffle_fma_const_chain(
13+
; AVX-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0:[0-9]+]] {
14+
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
15+
; AVX-NEXT: [[RES:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
16+
; AVX-NEXT: ret <4 x float> [[RES]]
1117
;
1218
%f = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
1319
%res = shufflevector <4 x float> %f, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -16,7 +22,7 @@ define <4 x float> @shuffle_fma_const_chain(<4 x float> %a0) {
1622

1723
define <8 x float> @concat_fma_const_chain(<4 x float> %a0, <4 x float> %a1) {
1824
; CHECK-LABEL: define <8 x float> @concat_fma_const_chain(
19-
; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] {
25+
; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0:[0-9]+]] {
2026
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2127
; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> splat (float 0x3F8DE8D040000000), <8 x float> splat (float 0xBFB3715EE0000000))
2228
; CHECK-NEXT: ret <8 x float> [[RES]]

0 commit comments

Comments
 (0)