Skip to content

Commit 7c273a1

Browse files
dfukalov authored and memfrob committed
[AMDGPU][CostModel] Add f16, f64 and contract cases to fused costs estimation.
Add cases of fused fmul+fadd/fsub with f16 and f64 operands to cost model.
Also added operations with contract attribute.
Fixed line endings in test.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D84995
1 parent ae042a2 commit 7c273a1

File tree

3 files changed

+188
-60
lines changed

3 files changed

+188
-60
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -510,11 +510,21 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
510510
// Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
511511
// fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
512512
// fused operation.
513-
if (!HasFP32Denormals && SLT == MVT::f32 && CxtI && CxtI->hasOneUse())
513+
if (CxtI && CxtI->hasOneUse())
514514
if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
515515
const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
516516
if (OPC == ISD::FADD || OPC == ISD::FSUB) {
517-
return TargetTransformInfo::TCC_Free;
517+
if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
518+
return TargetTransformInfo::TCC_Free;
519+
if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
520+
return TargetTransformInfo::TCC_Free;
521+
522+
// Assume operations of any type may be fused when contract/unsafe flags are set.
523+
const TargetOptions &Options = TLI->getTargetMachine().Options;
524+
if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
525+
Options.UnsafeFPMath ||
526+
(FAdd->hasAllowContract() && CxtI->hasAllowContract()))
527+
return TargetTransformInfo::TCC_Free;
518528
}
519529
}
520530
LLVM_FALLTHROUGH;

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
7878
AMDGPUTTIImpl CommonTTI;
7979
bool IsGraphicsShader;
8080
bool HasFP32Denormals;
81+
bool HasFP64FP16Denormals;
8182
unsigned MaxVGPRs;
8283

8384
const FeatureBitset InlineFeatureIgnoreList = {
@@ -133,16 +134,18 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
133134

134135
public:
135136
explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
136-
: BaseT(TM, F.getParent()->getDataLayout()),
137-
ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
138-
TLI(ST->getTargetLowering()),
139-
CommonTTI(TM, F),
140-
IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
141-
HasFP32Denormals(AMDGPU::SIModeRegisterDefaults(F).allFP32Denormals()),
142-
MaxVGPRs(ST->getMaxNumVGPRs(
143-
std::max(ST->getWavesPerEU(F).first,
144-
ST->getWavesPerEUForWorkGroup(
145-
ST->getFlatWorkGroupSizes(F).second)))) {}
137+
: BaseT(TM, F.getParent()->getDataLayout()),
138+
ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
139+
TLI(ST->getTargetLowering()), CommonTTI(TM, F),
140+
IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
141+
MaxVGPRs(ST->getMaxNumVGPRs(
142+
std::max(ST->getWavesPerEU(F).first,
143+
ST->getWavesPerEUForWorkGroup(
144+
ST->getFlatWorkGroupSizes(F).second)))) {
145+
AMDGPU::SIModeRegisterDefaults Mode(F);
146+
HasFP32Denormals = Mode.allFP32Denormals();
147+
HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
148+
}
146149

147150
bool hasBranchDivergence() { return true; }
148151
bool useGPUDivergenceAnalysis() const;
Lines changed: 163 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,163 @@
1-
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=FUSED,ALL %s
2-
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=SLOW,ALL %s
3-
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=FUSED,ALL %s
4-
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=SLOW,ALL %s
5-
6-
target triple = "amdgcn--"
7-
8-
; ALL-LABEL: 'fmul_fadd_f32':
9-
; FUSED: estimated cost of 0 for instruction: %mul = fmul float
10-
; SLOW: estimated cost of 1 for instruction: %mul = fmul float
11-
; ALL: estimated cost of 1 for instruction: %add = fadd float
12-
define float @fmul_fadd_f32(float %r0, float %r1, float %r2) #0 {
13-
%mul = fmul float %r0, %r1
14-
%add = fadd float %mul, %r2
15-
ret float %add
16-
}
17-
18-
; ALL-LABEL: 'fmul_fadd_v2f32':
19-
; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float>
20-
; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float>
21-
; ALL: estimated cost of 2 for instruction: %add = fadd <2 x float>
22-
define <2 x float> @fmul_fadd_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
23-
%mul = fmul <2 x float> %r0, %r1
24-
%add = fadd <2 x float> %mul, %r2
25-
ret <2 x float> %add
26-
}
27-
28-
; ALL-LABEL: 'fmul_fsub_f32':
29-
; FUSED: estimated cost of 0 for instruction: %mul = fmul float
30-
; SLOW: estimated cost of 1 for instruction: %mul = fmul float
31-
; ALL: estimated cost of 1 for instruction: %sub = fsub float
32-
define float @fmul_fsub_f32(float %r0, float %r1, float %r2) #0 {
33-
%mul = fmul float %r0, %r1
34-
%sub = fsub float %mul, %r2
35-
ret float %sub
36-
}
37-
38-
; ALL-LABEL: 'fmul_fsub_v2f32':
39-
; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float>
40-
; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float>
41-
; ALL: estimated cost of 2 for instruction: %sub = fsub <2 x float>
42-
define <2 x float> @fmul_fsub_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
43-
%mul = fmul <2 x float> %r0, %r1
44-
%sub = fsub <2 x float> %mul, %r2
45-
ret <2 x float> %sub
46-
}
47-
48-
attributes #0 = { nounwind }
1+
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,NOCONTRACT,ALL %s
2+
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s
3+
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,ALL %s
4+
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,ALL %s
5+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED32,FUSED16,NOCONTRACT,ALL %s
6+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s
7+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED32,FUSED16,CONTRACT,ALL %s
8+
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,ALL %s
9+
10+
target triple = "amdgcn--"
11+
12+
; ALL-LABEL: 'fmul_fadd_f32':
13+
; FUSED: estimated cost of 0 for instruction: %mul = fmul float
14+
; SLOW: estimated cost of 1 for instruction: %mul = fmul float
15+
; GFX1030: estimated cost of 1 for instruction: %mul = fmul float
16+
; ALL: estimated cost of 1 for instruction: %add = fadd float
17+
define float @fmul_fadd_f32(float %r0, float %r1, float %r2) #0 {
18+
%mul = fmul float %r0, %r1
19+
%add = fadd float %mul, %r2
20+
ret float %add
21+
}
22+
23+
; ALL-LABEL: 'fmul_fadd_contract_f32':
24+
; ALL: estimated cost of 0 for instruction: %mul = fmul contract float
25+
; ALL: estimated cost of 1 for instruction: %add = fadd contract float
26+
define float @fmul_fadd_contract_f32(float %r0, float %r1, float %r2) #0 {
27+
%mul = fmul contract float %r0, %r1
28+
%add = fadd contract float %mul, %r2
29+
ret float %add
30+
}
31+
32+
; ALL-LABEL: 'fmul_fadd_v2f32':
33+
; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float>
34+
; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float>
35+
; GFX1030: estimated cost of 2 for instruction: %mul = fmul <2 x float>
36+
; ALL: estimated cost of 2 for instruction: %add = fadd <2 x float>
37+
define <2 x float> @fmul_fadd_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
38+
%mul = fmul <2 x float> %r0, %r1
39+
%add = fadd <2 x float> %mul, %r2
40+
ret <2 x float> %add
41+
}
42+
43+
; ALL-LABEL: 'fmul_fsub_f32':
44+
; FUSED: estimated cost of 0 for instruction: %mul = fmul float
45+
; SLOW: estimated cost of 1 for instruction: %mul = fmul float
46+
; GFX1030: estimated cost of 1 for instruction: %mul = fmul float
47+
; ALL: estimated cost of 1 for instruction: %sub = fsub float
48+
define float @fmul_fsub_f32(float %r0, float %r1, float %r2) #0 {
49+
%mul = fmul float %r0, %r1
50+
%sub = fsub float %mul, %r2
51+
ret float %sub
52+
}
53+
54+
; ALL-LABEL: 'fmul_fsub_v2f32':
55+
; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float>
56+
; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float>
57+
; GFX1030: estimated cost of 2 for instruction: %mul = fmul <2 x float>
58+
; ALL: estimated cost of 2 for instruction: %sub = fsub <2 x float>
59+
define <2 x float> @fmul_fsub_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
60+
%mul = fmul <2 x float> %r0, %r1
61+
%sub = fsub <2 x float> %mul, %r2
62+
ret <2 x float> %sub
63+
}
64+
65+
; ALL-LABEL: 'fmul_fadd_f16':
66+
; FUSED: estimated cost of 0 for instruction: %mul = fmul half
67+
; SLOW: estimated cost of 1 for instruction: %mul = fmul half
68+
; ALL: estimated cost of 1 for instruction: %add = fadd half
69+
define half @fmul_fadd_f16(half %r0, half %r1, half %r2) #0 {
70+
%mul = fmul half %r0, %r1
71+
%add = fadd half %mul, %r2
72+
ret half %add
73+
}
74+
75+
; ALL-LABEL: 'fmul_fadd_contract_f16':
76+
; ALL: estimated cost of 0 for instruction: %mul = fmul contract half
77+
; ALL: estimated cost of 1 for instruction: %add = fadd contract half
78+
define half @fmul_fadd_contract_f16(half %r0, half %r1, half %r2) #0 {
79+
%mul = fmul contract half %r0, %r1
80+
%add = fadd contract half %mul, %r2
81+
ret half %add
82+
}
83+
84+
; ALL-LABEL: 'fmul_fadd_v2f16':
85+
; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x half>
86+
; SLOW: estimated cost of 1 for instruction: %mul = fmul <2 x half>
87+
; ALL: estimated cost of 1 for instruction: %add = fadd <2 x half>
88+
define <2 x half> @fmul_fadd_v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2) #0 {
89+
%mul = fmul <2 x half> %r0, %r1
90+
%add = fadd <2 x half> %mul, %r2
91+
ret <2 x half> %add
92+
}
93+
94+
; ALL-LABEL: 'fmul_fsub_f16':
95+
; FUSED: estimated cost of 0 for instruction: %mul = fmul half
96+
; SLOW: estimated cost of 1 for instruction: %mul = fmul half
97+
; ALL: estimated cost of 1 for instruction: %sub = fsub half
98+
define half @fmul_fsub_f16(half %r0, half %r1, half %r2) #0 {
99+
%mul = fmul half %r0, %r1
100+
%sub = fsub half %mul, %r2
101+
ret half %sub
102+
}
103+
104+
; ALL-LABEL: 'fmul_fsub_v2f16':
105+
; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x half>
106+
; SLOW: estimated cost of 1 for instruction: %mul = fmul <2 x half>
107+
; ALL: estimated cost of 1 for instruction: %sub = fsub <2 x half>
108+
define <2 x half> @fmul_fsub_v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2) #0 {
109+
%mul = fmul <2 x half> %r0, %r1
110+
%sub = fsub <2 x half> %mul, %r2
111+
ret <2 x half> %sub
112+
}
113+
114+
; ALL-LABEL: 'fmul_fadd_f64':
115+
; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double
116+
; NOCONTRACT: estimated cost of 3 for instruction: %mul = fmul double
117+
; ALL: estimated cost of 3 for instruction: %add = fadd double
118+
define double @fmul_fadd_f64(double %r0, double %r1, double %r2) #0 {
119+
%mul = fmul double %r0, %r1
120+
%add = fadd double %mul, %r2
121+
ret double %add
122+
}
123+
124+
; ALL-LABEL: 'fmul_fadd_contract_f64':
125+
; ALL: estimated cost of 0 for instruction: %mul = fmul contract double
126+
; ALL: estimated cost of 3 for instruction: %add = fadd contract double
127+
define double @fmul_fadd_contract_f64(double %r0, double %r1, double %r2) #0 {
128+
%mul = fmul contract double %r0, %r1
129+
%add = fadd contract double %mul, %r2
130+
ret double %add
131+
}
132+
133+
; ALL-LABEL: 'fmul_fadd_v2f64':
134+
; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double>
135+
; NOCONTRACT: estimated cost of 6 for instruction: %mul = fmul <2 x double>
136+
; ALL: estimated cost of 6 for instruction: %add = fadd <2 x double>
137+
define <2 x double> @fmul_fadd_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
138+
%mul = fmul <2 x double> %r0, %r1
139+
%add = fadd <2 x double> %mul, %r2
140+
ret <2 x double> %add
141+
}
142+
143+
; ALL-LABEL: 'fmul_fsub_f64':
144+
; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double
145+
; NOCONTRACT: estimated cost of 3 for instruction: %mul = fmul double
146+
; ALL: estimated cost of 3 for instruction: %sub = fsub double
147+
define double @fmul_fsub_f64(double %r0, double %r1, double %r2) #0 {
148+
%mul = fmul double %r0, %r1
149+
%sub = fsub double %mul, %r2
150+
ret double %sub
151+
}
152+
153+
; ALL-LABEL: 'fmul_fsub_v2f64':
154+
; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double>
155+
; NOCONTRACT: estimated cost of 6 for instruction: %mul = fmul <2 x double>
156+
; ALL: estimated cost of 6 for instruction: %sub = fsub <2 x double>
157+
define <2 x double> @fmul_fsub_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
158+
%mul = fmul <2 x double> %r0, %r1
159+
%sub = fsub <2 x double> %mul, %r2
160+
ret <2 x double> %sub
161+
}
162+
163+
attributes #0 = { nounwind }

0 commit comments

Comments
 (0)