7
7
# end
8
8
9
9
"""
    InstructionCost

Cost-model entry describing a single instruction.

# Fields
- `scaling`: how the scalar cost scales with vector width. Sentinel values:
  `-3.0` == no scaling, `-2.0` == offset scaling, `-1.0` == linear scaling;
  a value `> 0` is used as both the latency and the reciprocal throughput.
- `scalar_reciprical_throughput`: reciprocal throughput of the scalar instruction.
- `scalar_latency`: latency of the scalar instruction.
- `register_pressure`: register-pressure heuristic attributed to the instruction.
"""
struct InstructionCost
    scaling::Float64
    scalar_reciprical_throughput::Float64
    scalar_latency::Int
    register_pressure::Int
end

# The convenience constructors take their arguments in
# (latency, reciprocal throughput, scaling, register pressure) order, which is the
# order the cost-table entries are written in, and forward them in field order.
# Without the explicit four-argument method such calls would hit the default
# field-order constructor and silently store the latency as `scaling`.
function InstructionCost(sl::Int, srt::Float64, scaling::Float64, register_pressure::Int)
    return InstructionCost(scaling, srt, sl, register_pressure)
end

# Two/three-argument form: no scaling (`-3.0`) and zero register pressure by default.
function InstructionCost(sl::Int, srt::Float64, scaling::Float64 = -3.0)
    return InstructionCost(sl, srt, scaling, 0)
end
16
16
17
17
# Return the width-independent costs stored on `instruction` as the tuple
# (reciprocal throughput, latency, register pressure).
function scalar_cost(instruction::InstructionCost)
    return (
        instruction.scalar_reciprical_throughput,
        instruction.scalar_latency,
        instruction.register_pressure,
    )
end
20
21
# Return (reciprocal throughput, latency, register pressure) for `instruction`
# when executed on vectors of `1 << Wshift` elements of `sizeof_T` bytes each.
# `instruction.scaling` selects how the scalar costs are adjusted:
#   -3.0 => no scaling, -2.0 => offset scaling, -1.0 => linear scaling,
#   anything else => custom cost used as both latency and reciprocal throughput.
# The register pressure is always passed through unchanged.
function vector_cost(instruction::InstructionCost, Wshift, sizeof_T)
    rthroughput, latency, pressure = scalar_cost(instruction)
    mode = instruction.scaling
    # No scaling requested, or a shift of zero (a single lane): scalar costs apply as-is.
    if mode == -3.0 || Wshift == 0
        return rthroughput, latency, pressure
    end
    if mode == -2.0 # offset scaling
        # NOTE(review): for narrow vectors this shift is negative, and `1 << shift`
        # is then 0, zeroing the throughput. Confirm that is intended.
        shift = Wshift + VectorizationBase.intlog2(sizeof_T) - 4
        rthroughput *= 1 << shift
        # These instructions experience double latency with zmm
        if (sizeof_T << Wshift) == 64 # VectorizationBase.REGISTER_SIZE
            latency += latency
        end
    elseif mode == -1.0 # linear scaling
        # Throughput grows with the lane count; the latency keeps its scalar
        # distance above the throughput.
        slack = latency - rthroughput
        rthroughput *= 1 << Wshift
        latency = rthroughput + slack
    else # we assume custom cost, and that latency == recip_throughput
        rthroughput = mode
        latency = mode
    end
    return rthroughput, latency, pressure
end
40
41
function cost (instruction:: InstructionCost , Wshift, sizeof_T)
41
42
Wshift == 0 ? scalar_cost (instruction) : vector_cost (instruction, Wshift, sizeof_T)
@@ -48,12 +49,19 @@ function cost(instruction::Symbol, Wshift, sizeof_T)
48
49
)
49
50
end
50
51
52
+
51
53
# Just a semi-reasonable assumption; should not be that sensitive to anything other than loads
52
54
# Fallback cost entry. The third argument, -1.0, is the "linear scaling" sentinel.
# NOTE(review): the arguments appear to be written in
# (latency, reciprocal throughput, scaling, register pressure) order, while the
# struct declares `scaling` first. Confirm that a constructor accepting this
# order is in scope.
const OPAQUE_INSTRUCTION = InstructionCost (50 , 50.0 , - 1.0 , VectorizationBase. REGISTER_COUNT)
53
55
56
+ # Comments on setindex!
57
+ # 1. Not a part of dependency chains, so not really twice as expensive as getindex?
58
+ # 2. getindex loads a register, not setindex!, but we place cost on setindex!
59
+ # as a heuristic means of approximating register pressure, since many loads can be
60
+ # consolidated into a single register. The number of LICM-ed setindex!, on the other
61
+ # hand, should indicate how many registers we're keeping live for the sake of eventually storing.
54
62
const COST = Dict {Symbol,InstructionCost} (
55
- :getindex => InstructionCost (3 ,0.5 ),
56
- :setindex! => InstructionCost (3 ,1.0 ), # but not a part of dependency chains, so not really twice as expensive?
63
+ :getindex => InstructionCost (3 ,0.5 , - 3.0 , 0 ),
64
+ :setindex! => InstructionCost (3 ,1.0 , - 3.0 , 1 ),
57
65
:(+ ) => InstructionCost (4 ,0.5 ),
58
66
:(- ) => InstructionCost (4 ,0.5 ),
59
67
:(* ) => InstructionCost (4 ,0.5 ),
@@ -66,7 +74,7 @@ const COST = Dict{Symbol,InstructionCost}(
66
74
:(< ) => InstructionCost (1 , 0.5 ),
67
75
:(>= ) => InstructionCost (1 , 0.5 ),
68
76
:(<= ) => InstructionCost (1 , 0.5 ),
69
- :inv => InstructionCost (13 ,4.0 ,- 2.0 ,2 ),
77
+ :inv => InstructionCost (13 ,4.0 ,- 2.0 ,1 ),
70
78
:muladd => InstructionCost (4 ,0.5 ), # + and * will fuse into this, so much of the time they're not twice as expensive
71
79
:fma => InstructionCost (4 ,0.5 ), # + and * will fuse into this, so much of the time they're not twice as expensive
72
80
:vmuladd => InstructionCost (4 ,0.5 ), # + and * will fuse into this, so much of the time they're not twice as expensive
@@ -76,12 +84,15 @@ const COST = Dict{Symbol,InstructionCost}(
76
84
:vfnmadd => InstructionCost (4 ,0.5 ), # + and -* will fuse into this, so much of the time they're not twice as expensive
77
85
:vfnmsub => InstructionCost (4 ,0.5 ), # - and -* will fuse into this, so much of the time they're not twice as expensive
78
86
:sqrt => InstructionCost (15 ,4.0 ,- 2.0 ),
79
- :log => InstructionCost (20 ,20.0 ,40.0 ,21 ),
80
- :exp => InstructionCost (20 ,20.0 ,20.0 ,19 ),
81
- :sin => InstructionCost (18 ,15.0 ,68.0 ,24 ),
82
- :cos => InstructionCost (18 ,15.0 ,68.0 ,27 ),
83
- :sincos => InstructionCost (25 ,22.0 ,70.0 ,27 )
87
+ :log => InstructionCost (20 ,20.0 ,40.0 ,20 ),
88
+ :exp => InstructionCost (20 ,20.0 ,20.0 ,18 ),
89
+ :sin => InstructionCost (18 ,15.0 ,68.0 ,23 ),
90
+ :cos => InstructionCost (18 ,15.0 ,68.0 ,26 ),
91
+ :sincos => InstructionCost (25 ,22.0 ,70.0 ,26 )
84
92
)
93
# Register every entry a second time under Symbol("typeof(<name>)") so we can
# look up Symbol(typeof(function)).
# Iterate over a snapshot: inserting into a Dict while iterating over it can
# trigger a rehash, which may skip entries or revisit newly added ones.
for (k, v) ∈ collect(COST)
    COST[Symbol("typeof(", k, ")")] = v
end
85
96
86
97
87
98
# const SIMDPIRATES_COST = Dict{Symbol,InstructionCost}()
0 commit comments