Skip to content

Commit eab11c8

Browse files
committed
[Analysis][AArch64] Add cost model for loop.dependence.{war/raw}.mask
This PR adds a cost model for the loop dependence mask intrinsics (llvm.loop.dependence.{war,raw}.mask), covering both the case where they must be expanded and the case where they can be lowered directly on AArch64.
1 parent f1b5504 commit eab11c8

File tree

3 files changed

+185
-0
lines changed

3 files changed

+185
-0
lines changed

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2190,6 +2190,53 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
21902190
// Otherwise, fallback to default scalarization cost.
21912191
break;
21922192
}
2193+
case Intrinsic::loop_dependence_raw_mask:
2194+
case Intrinsic::loop_dependence_war_mask: {
2195+
InstructionCost Cost = 0;
2196+
Type *PtrTy = ICA.getArgTypes()[0];
2197+
bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask;
2198+
2199+
Cost +=
2200+
thisT()->getArithmeticInstrCost(Instruction::Sub, PtrTy, CostKind);
2201+
if (IsReadAfterWrite) {
2202+
IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, PtrTy, {PtrTy}, {});
2203+
Cost += thisT()->getIntrinsicInstrCost(AbsAttrs, CostKind);
2204+
}
2205+
2206+
Cost +=
2207+
thisT()->getArithmeticInstrCost(Instruction::SDiv, PtrTy, CostKind);
2208+
Type *CmpTy =
2209+
getTLI()
2210+
->getSetCCResultType(
2211+
thisT()->getDataLayout(), RetTy->getContext(),
2212+
getTLI()->getValueType(thisT()->getDataLayout(), PtrTy))
2213+
.getTypeForEVT(RetTy->getContext());
2214+
Cost += thisT()->getCmpSelInstrCost(
2215+
BinaryOperator::ICmp, CmpTy, PtrTy,
2216+
IsReadAfterWrite ? CmpInst::ICMP_EQ : CmpInst::ICMP_SLE, CostKind);
2217+
2218+
// The deconstructed active lane mask
2219+
VectorType *RetTyVec = cast<VectorType>(RetTy);
2220+
VectorType *SplatTy = cast<VectorType>(RetTyVec->getWithNewType(PtrTy));
2221+
Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SplatTy, SplatTy, {},
2222+
CostKind, 0, nullptr);
2223+
IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, SplatTy, {},
2224+
FMF);
2225+
Cost += thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
2226+
Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SplatTy,
2227+
SplatTy, CmpInst::ICMP_ULT, CostKind);
2228+
2229+
Cost +=
2230+
thisT()->getCastInstrCost(Instruction::CastOps::ZExt, RetTy, SplatTy,
2231+
TTI::CastContextHint::None, CostKind);
2232+
Cost += thisT()->getCastInstrCost(Instruction::CastOps::ZExt,
2233+
RetTyVec->getElementType(), CmpTy,
2234+
TTI::CastContextHint::None, CostKind);
2235+
Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, RetTyVec, RetTyVec, {},
2236+
CostKind, 0, nullptr);
2237+
Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2238+
return Cost;
2239+
}
21932240
}
21942241

21952242
// Assume that we need to scalarize this intrinsic.

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,6 +1032,40 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
10321032
}
10331033
break;
10341034
}
1035+
case Intrinsic::loop_dependence_raw_mask:
1036+
case Intrinsic::loop_dependence_war_mask: {
1037+
auto *EltSize = cast<ConstantInt>(ICA.getArgs()[2]);
1038+
EVT VecVT = getTLI()->getValueType(DL, RetTy);
1039+
// An invalid element size and return type combination must be expanded.
1040+
bool MustBeExpanded = false;
1041+
switch (EltSize->getSExtValue()) {
1042+
case 1:
1043+
if (VecVT != MVT::v16i1 && VecVT != MVT::nxv16i1)
1044+
MustBeExpanded = true;
1045+
break;
1046+
case 2:
1047+
if (VecVT != MVT::v8i1 && VecVT != MVT::nxv8i1)
1048+
MustBeExpanded = true;
1049+
break;
1050+
case 4:
1051+
if (VecVT != MVT::v4i1 && VecVT != MVT::nxv4i1)
1052+
MustBeExpanded = true;
1053+
break;
1054+
case 8:
1055+
if (VecVT != MVT::v2i1 && VecVT != MVT::nxv2i1)
1056+
MustBeExpanded = true;
1057+
break;
1058+
default:
1059+
MustBeExpanded = true;
1060+
// Other element sizes are incompatible with whilewr/rw, so expand instead
1061+
break;
1062+
}
1063+
1064+
// The whilewr/rw instructions require SVE2 or SME
1065+
if (MustBeExpanded || (!ST->hasSVE2() && !ST->hasSME()))
1066+
break;
1067+
return 1;
1068+
}
10351069
case Intrinsic::experimental_vector_extract_last_active:
10361070
if (ST->isSVEorStreamingSVEAvailable()) {
10371071
auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
2+
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-EXPANDED
3+
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve2 | FileCheck %s --check-prefix=CHECK
4+
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sme | FileCheck %s --check-prefix=CHECK
5+
6+
; loop.dependence.{war,raw}.mask can be lowered to while{wr,rw} if SVE2 or SME is enabled.
7+
define void @loop_dependence_war_mask(ptr %a, ptr %b) {
8+
; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask'
9+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
10+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
11+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
12+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
13+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
14+
;
15+
; CHECK-LABEL: 'loop_dependence_war_mask'
16+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
17+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
18+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
19+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
20+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
21+
;
22+
entry:
23+
%res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
24+
%res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
25+
%res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
26+
%res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
27+
ret void
28+
}
29+
30+
define void @loop_dependence_raw_mask(ptr %a, ptr %b) {
31+
; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask'
32+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
33+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
34+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
35+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
36+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
37+
;
38+
; CHECK-LABEL: 'loop_dependence_raw_mask'
39+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
40+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
41+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
42+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
43+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
44+
;
45+
entry:
46+
%res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
47+
%res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
48+
%res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
49+
%res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
50+
ret void
51+
}
52+
53+
; Invalid element size and return type combinations must be expanded, even with sve2/sme
54+
define void @loop_dependence_war_mask_invalid(ptr %a, ptr %b) {
55+
; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask_invalid'
56+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
57+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
58+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
59+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
60+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
61+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
62+
;
63+
; CHECK-LABEL: 'loop_dependence_war_mask_invalid'
64+
; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
65+
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
66+
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
67+
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
68+
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
69+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
70+
;
71+
entry:
72+
%res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
73+
%res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
74+
%res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
75+
%res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
76+
%res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
77+
ret void
78+
}
79+
80+
define void @loop_dependence_raw_mask_invalid(ptr %a, ptr %b) {
81+
; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask_invalid'
82+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
83+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
84+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
85+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
86+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
87+
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
88+
;
89+
; CHECK-LABEL: 'loop_dependence_raw_mask_invalid'
90+
; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
91+
; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
92+
; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
93+
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
94+
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
95+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
96+
;
97+
entry:
98+
%res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
99+
%res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
100+
%res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
101+
%res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
102+
%res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
103+
ret void
104+
}

0 commit comments

Comments
 (0)