Commit 4cf6786

Working on computing costs of different loop orderings and unrolling strategies.
1 parent 6bf259c commit 4cf6786

3 files changed: +183 −2 lines

src/costs.jl

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# @enum CostScaling begin
#     NoCost
#     Linear
#     Unique
# end

struct InstructionCost
    scalar_latency::Int
    scalar_reciprocal_throughput::Float64
    scaling::Float64 # sentinel values: -2 == no scaling; -1 == linear scaling; > 0 => value is both the latency and the reciprocal throughput
end
# Sentinel constants assumed from the convention above; the commented-out CostScaling enum is not yet in use.
const NoCost = -2.0
const Linear = -1.0
InstructionCost(sl, srt) = InstructionCost(sl, srt, NoCost)

function scalar_cost(instruction::InstructionCost)#, ::Type{T} = Float64) where {T}
    instruction.scalar_latency, instruction.scalar_reciprocal_throughput
end
function vector_cost(instruction::InstructionCost, Wshift, ::Type{T} = Float64) where {T}
    sl, srt = scalar_cost(instruction)
    scaling = instruction.scaling
    if scaling == NoCost || Wshift == 0
        return sl, srt
    elseif scaling == Linear
        srt *= 1 << (Wshift + VectorizationBase.intlog2(sizeof(T)) - 4)
        if (sizeof(T) << Wshift) == VectorizationBase.REGISTER_SIZE # these instructions experience double latency with zmm registers
            sl += sl
        end
    end
    sl, srt
end
function cost(instruction::InstructionCost, Wshift, ::Type{T}) where {T}
    Wshift == 0 ? scalar_cost(instruction) : vector_cost(instruction, Wshift, T)
end

const COST = Dict{Symbol,InstructionCost}(
    :getindex => InstructionCost(3, 0.5),
    :setindex! => InstructionCost(3, 1.0), # but not part of a dependency chain, so not really twice as expensive?
    :+ => InstructionCost(4, 0.5),
    :- => InstructionCost(4, 0.5),
    :* => InstructionCost(4, 0.5),
    :/ => InstructionCost(13, 4.0),
    :muladd => InstructionCost(4, 0.5) # + and * will fuse into this, so much of the time they're not twice as expensive
    # Work-in-progress placeholders, omitted until a matching constructor exists:
    # :sqrt => InstructionCost(),
    # :log => InstructionCost(,,52.5),
    # :exp => InstructionCost(,,30.0),
    # :sin => InstructionCost(),
    # :cos => InstructionCost(),
    # :sincos => InstructionCost()
)

# const SIMDPIRATES_COST = Dict{Symbol,InstructionCost}()
# const SLEEFPIRATES_COST = Dict{Symbol,InstructionCost}()

# const MODULE_LOOKUP = Dict{Symbol,Dict{Symbol,InstructionCost}}(
#     :Base => BASE_COST,
#     :SIMDPirates => SIMDPIRATES_COST,
#     :SLEEFPirates => SLEEFPIRATES_COST
# )
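For a sense of how the table above is meant to be consumed, a minimal usage sketch — not part of the commit — assuming the file has been included and an AVX-512 target, so Wshift == 3 (8 Float64 lanes per 64-byte register):

# Illustrative only: query the cost table and scale for a vector width.
addcost = COST[:+]              # latency 4, reciprocal throughput 0.5, NoCost scaling
cost(addcost, 0, Float64)       # scalar path: (4, 0.5)
cost(addcost, 3, Float64)       # vector path; scaling == NoCost, so still (4, 0.5)
cost(COST[:/], 3, Float64)      # (13, 4.0) — also unscaled, since :/ uses the NoCost default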

src/graphs.jl

Lines changed: 74 additions & 2 deletions
@@ -1,14 +1,86 @@
# using LightGraphs # the previous `using LightGraphs` import was commented out in this commit

struct ShortIntVector
    data::Vector{Int}
end
function Base.hash(x::ShortIntVector, h::UInt)
    d = x.data
    @inbounds for n ∈ eachindex(d)
        h = hash(d[n], h)
    end
    h
end

@enum NodeType begin
    input
    store
    reduction
end

struct Node
    type::DataType
end

# Must make it easy to iterate
struct LoopSet

end

# Evaluates the cost of evaluating the loop nest in a given order.
function evaluate_cost(
    ls::LoopSet, order::ShortIntVector
)

end

# Here, we have to figure out how to convert the loop set into a vectorized expression.
# This must traverse in a parent -> child pattern,
# but the order also depends on which loop indices each node depends on.
# Requires sorting.
function lower(ls::LoopSet)

end

function Base.convert(::Type{Expr}, ls::LoopSet)
    lower(ls)
end
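The custom `Base.hash` suggests `ShortIntVector` is meant to key a `Dict`, e.g. to memoize `evaluate_cost` per loop ordering while searching over permutations. A sketch of that idea follows; the `cheapest_order` helper is hypothetical (not in the commit), it assumes `evaluate_cost` eventually returns a `Float64`, and a matching `Base.:(==)` method would also be needed for equal orderings to hash to the same entry:

# Illustrative sketch only: cache cost evaluations by loop order.
function cheapest_order(ls::LoopSet, orders::Vector{Vector{Int}})
    cache = Dict{ShortIntVector,Float64}()
    best_order, best_cost = first(orders), Inf
    for order ∈ orders
        key = ShortIntVector(order)
        c = get!(() -> evaluate_cost(ls, key), cache, key)
        if c < best_cost
            best_order, best_cost = order, c
        end
    end
    best_order, best_cost
end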

using BenchmarkTools, LoopVectorization, SLEEF
θ = randn(1000); c = randn(1000);
function sumsc_vectorized(θ::AbstractArray{Float64}, coef::AbstractArray{Float64})
    s, c = 0.0, 0.0
    @vvectorize for i ∈ eachindex(θ, coef)
        sinθᵢ, cosθᵢ = sincos(θ[i])
        s += coef[i] * sinθᵢ
        c += coef[i] * cosθᵢ
    end
    s, c
end
function sumsc_serial(θ::AbstractArray{Float64}, coef::AbstractArray{Float64})
    s, c = 0.0, 0.0
    @inbounds for i ∈ eachindex(θ, coef)
        sinθᵢ, cosθᵢ = sincos(θ[i])
        s += coef[i] * sinθᵢ
        c += coef[i] * cosθᵢ
    end
    s, c
end
function sumsc_sleef(θ::AbstractArray{Float64}, coef::AbstractArray{Float64})
    s, c = 0.0, 0.0
    @inbounds @simd for i ∈ eachindex(θ, coef)
        sinθᵢ, cosθᵢ = SLEEF.sincos_fast(θ[i])
        s += coef[i] * sinθᵢ
        c += coef[i] * cosθᵢ
    end
    s, c
end

@btime sumsc_serial($θ, $c)
@btime sumsc_sleef($θ, $c)
@btime sumsc_vectorized($θ, $c)

test/runtests.jl

Lines changed: 46 additions & 0 deletions
@@ -1,6 +1,52 @@
using LoopVectorization
using Test

using CpuId, VectorizationBase, SIMDPirates, SLEEFPirates
@generated function estimate_cost(f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)) where {F,T,U}
    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
    quote
        Base.Cartesian.@nexprs $U u -> s_u = vbroadcast(Vec{$W,$T}, zero(T))
        # s = vbroadcast(V, zero(T))
        x = rand(T, N << $Wshift)
        ptrx = pointer(x)
        ts_start, id_start = cpucycle_id()
        for k ∈ 1:K
            _ptrx = ptrx
            for n ∈ 1:N>>$(VectorizationBase.intlog2(U))
                Base.Cartesian.@nexprs $U u -> begin
                    v_u = vload(Vec{$W,$T}, _ptrx)
                    s_u = vadd(s_u, f(v_u))
                    _ptrx += VectorizationBase.REGISTER_SIZE
                end
                # v = vload(V, _ptrx)
                # s = vadd(s, f(v))
                # _ptrx += VectorizationBase.REGISTER_SIZE
            end
        end
        ts_end, id_end = cpucycle_id()
        @assert id_start == id_end
        Base.Cartesian.@nexprs $(U-1) u -> s_1 = vadd(s_1, s_{u+1})
        (ts_end - ts_start) / (N*K), vsum(s_1)
    end
end
estimate_cost(SLEEFPirates.exp, 512, 1_000, Float64, Val(4)) # 28 cycles

estimate_cost(SLEEFPirates.log, 512, 1_000, Float64, Val(1)) # 51 cycles
estimate_cost(SLEEFPirates.log, 512, 1_000, Float64, Val(2)) # 51 cycles
estimate_cost(SLEEFPirates.log, 512, 1_000, Float64, Val(4)) # 51 cycles
estimate_cost(SIMDPirates.vsqrt, 512, 1_000, Float64, Val(1)) # 23 cycles
estimate_cost(SIMDPirates.vsqrt, 512, 1_000, Float64, Val(2)) # 23 cycles
estimate_cost(SIMDPirates.vsqrt, 512, 1_000, Float64, Val(4)) # 23 cycles
estimate_cost(SIMDPirates.vinv, 512, 1_000, Float64, Val(1)) # 23 cycles
estimate_cost(SIMDPirates.vinv, 512, 1_000, Float64, Val(2)) # 23 cycles
estimate_cost(SIMDPirates.vinv, 512, 1_000, Float64, Val(4)) # 23 cycles

const cz = ntuple(Val(4)) do i Core.VecElement(randn()) end
# @code_native debuginfo=:none
estimate_cost(x -> SIMDPirates.vmul(x, cz), 1 << 9, 10^3, Float64, Val(1)) # 4.5 cycles
estimate_cost(x -> SIMDPirates.vmul(x, cz), 1 << 9, 10^3, Float64, Val(2)) # 2 cycles
estimate_cost(x -> SIMDPirates.vmul(x, cz), 1 << 9, 10^3, Float64, Val(4)) # 1 cycle
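The vmul timings illustrate the latency versus reciprocal-throughput distinction that src/costs.jl encodes: with a single accumulator (Val(1)) each iteration's vadd must wait on the previous result, so the ~4.5 cycles per vector is roughly the accumulation latency plus overhead, while four independent accumulators (Val(4)) overlap the chains and approach the throughput limit of ~1 cycle per vector. A rough sanity check, assuming the latency-4 / reciprocal-throughput-0.5 figures used for :* in the cost table (illustrative arithmetic, not part of the test suite):

# With U independent accumulators, cycles per vector ≈ max(latency / U, reciprocal throughput),
# ignoring the vload and loop overhead.
latency, rthroughput = 4, 0.5
predict(U) = max(latency / U, rthroughput)
predict(1), predict(2), predict(4)   # (4.0, 2.0, 1.0) vs. measured ≈ (4.5, 2.0, 1.0)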
@testset "LoopVectorization.jl" begin
    # Write your own tests here.
end
