
Commit 2c99f6a

Merge branch 'graphs' of https://github.com/chriselrod/LoopVectorization.jl into graphs
2 parents 408d0ac + db9a348

File tree

4 files changed: +331 −27 lines

README.md

Lines changed: 40 additions & 0 deletions
@@ -14,3 +14,43 @@ Pkg.add(PackageSpec(url="https://github.com/chriselrod/SIMDPirates.jl"))
 Pkg.add(PackageSpec(url="https://github.com/chriselrod/SLEEFPirates.jl"))
 Pkg.add(PackageSpec(url="https://github.com/chriselrod/LoopVectorization.jl"))
 ```
+
+
+## Usage
+
+The current version of LoopVectorization provides a simple, dumb transform of a single loop.
+What I mean by this is that it does not check the transformations for validity. To be safe, I would stick to straight loops that transform arrays or calculate reductions.
+
+For example,
+```julia
+function sum_simd(x)
+    s = zero(eltype(x))
+    @simd for xᵢ ∈ x
+        s += xᵢ
+    end
+    s
+end
+using LoopVectorization, BenchmarkTools
+function sum_loopvec(x::AbstractVector{Float64})
+    s = 0.0
+    @vvectorize 4 for i ∈ eachindex(x)
+        s += x[i]
+    end
+    s
+end
+x = rand(110);
+@btime sum($x)
+# 20.527 ns (0 allocations: 0 bytes)
+# 53.38001667116997
+
+@btime sum_simd($x)
+# 16.749 ns (0 allocations: 0 bytes)
+# 53.38001667116997
+
+@btime sum_loopvec($x)
+# 12.022 ns (0 allocations: 0 bytes)
+# 53.38001667116997
+```
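Since the macro does not check the transform for validity, loops with loop-carried dependencies are the main case to avoid. A minimal, hypothetical illustration (not part of the diff) of a loop that would be unsafe to vectorize blindly:

```julia
# Hypothetical example: iteration i reads the value written by iteration
# i - 1, so reordering or vectorizing the body changes the result.
function running_sum!(x)
    for i ∈ 2:length(x)
        x[i] += x[i-1] # loop-carried dependency
    end
    x
end
```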

src/costs.jl

Lines changed: 54 additions & 16 deletions
@@ -9,49 +9,87 @@
 struct InstructionCost
     scalar_latency::Int
     scalar_reciprical_throughput::Float64
-    scaling::Float64 # sentinel values: -2 == no scaling; -1 == scaling, >0 -> == latency == reciprocal throughput
-
+    scaling::Float64 # sentinel values: -3 == no scaling; -2 == offset_scaling, -1 == linear scaling, >0 -> == latency == reciprocal throughput
+    register_pressure::Int
 end
-InstructionCost(sl, srt) = InstructionCost(sl, srt, NoCost)
+InstructionCost(sl, srt, scaling = -3.0) = InstructionCost(sl, srt, scaling, 0)

 function scalar_cost(instruction::InstructionCost)#, ::Type{T} = Float64) where {T}
     instruction.scalar_latency, instruction.scalar_reciprical_throughput
 end
 function vector_cost(instruction::InstructionCost, Wshift, ::Type{T} = Float64) where {T}
     sl, srt = scalar_cost(instruction)
     scaling = instruction.scaling
-    if scaling == NoCost || Wshift == 0
-        return sl, srt
-    elseif scaling == Linear
+    if scaling == -3.0 || Wshift == 0
+        return sl, srt
+    elseif scaling == -2.0
         srt *= 1 << (Wshift + VectorizationBase.intlog2(sizeof(T)) - 4)
         if (sizeof(T) << Wshift) == VectorizationBase.REGISTER_SIZE # These instructions experience double latency with zmm
             sl += sl
         end
-    end
-
+    elseif scaling == -1.0
+        W = 1 << Wshift
+        extra_latency = sl - srt
+        srt *= W
+        sl = srt + extra_latency
+    else
+        sl, srt = scaling, scaling
+    end
     sl, srt
 end
 function cost(instruction::InstructionCost, Wshift, ::Type{T}) where {T}
     Wshift == 0 ? scalar_cost(instruction) : vector_cost(instruction, Wshift, T)
 end

+# Just a semi-reasonable assumption; should not be that sensitive to anything other than loads
+const OPAQUE_INSTRUCTION = InstructionCost(50, 50.0, -1.0, 32)
+
 const COST = Dict{Symbol,InstructionCost}(
     :getindex => InstructionCost(3,0.5),
     :setindex! => InstructionCost(3,1.0), # but not a part of dependency chains, so not really twice as expensive?
     :+ => InstructionCost(4,0.5),
     :- => InstructionCost(4,0.5),
     :* => InstructionCost(4,0.5),
-    :/ => InstructionCost(13,4.0,),
+    :/ => InstructionCost(13,4.0,-2.0),
+    :== => InstructionCost(1, 0.5),
+    :isequal => InstructionCost(1, 0.5),
+    :& => InstructionCost(1, 0.5),
+    :| => InstructionCost(1, 0.5),
+    :> => InstructionCost(1, 0.5),
+    :< => InstructionCost(1, 0.5),
+    :>= => InstructionCost(1, 0.5),
+    :<= => InstructionCost(1, 0.5),
+    :inv => InstructionCost(13,4.0,-2.0,1),
     :muladd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
-    :sqrt => InstructionCost(),
-    :log => InstructionCost(,,52.5),
-    :exp => InstructionCost(,,30.0),
-    :sin => InstructionCost(),
-    :cos => InstructionCost(),
-    :sincos => InstructionCost(),
-    :
+    :sqrt => InstructionCost(15,4.0,-2.0),
+    :log => InstructionCost(20,20.0,40.0,20),
+    :exp => InstructionCost(20,20.0,20.0,18),
+    :sin => InstructionCost(18,15.0,68.0,23),
+    :cos => InstructionCost(18,15.0,68.0,26),
+    :sincos => InstructionCost(25,22.0,70.0,26)
 )

+function sum_simd(x)
+    s = zero(eltype(x))
+    @simd for xᵢ ∈ x
+        s += xᵢ
+    end
+    s
+end
+using LoopVectorization, BenchmarkTools
+function sum_loopvec(x::AbstractVector{Float64})
+    s = 0.0
+    @vvectorize 4 for i ∈ eachindex(x)
+        s += x[i]
+    end
+    s
+end
+x = rand(111);
+@btime sum($x)
+@btime sum_simd($x)
+@btime sum_loopvec($x)
+
+
 # const SIMDPIRATES_COST = Dict{Symbol,InstructionCost}()
 # const SLEEFPIRATES_COST = Dict{Symbol,InstructionCost}()

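As an aside, the sentinel semantics above can be made concrete with a standalone sketch of `vector_cost`'s scaling rules, substituting plain assumptions (`sizeof(T) == 8`, 64-byte registers, `round(Int, log2(...))` in place of `VectorizationBase.intlog2`); this mirrors the diff, not the package's final API:

```julia
# Sketch of vector_cost's scaling rules under assumed Float64 elements and
# 64-byte (zmm) registers. scaling == -3.0: no scaling; -2.0: offset scaling;
# -1.0: linear scaling; a positive value is both latency and recip. throughput.
function vector_cost_sketch(sl, srt, scaling, Wshift; sizeofT = 8, register_size = 64)
    if scaling == -3.0 || Wshift == 0
        return sl, srt
    elseif scaling == -2.0
        srt *= 1 << (Wshift + round(Int, log2(sizeofT)) - 4)
        if (sizeofT << Wshift) == register_size
            sl += sl # these instructions experience double latency at full width
        end
    elseif scaling == -1.0
        W = 1 << Wshift
        extra_latency = sl - srt
        srt *= W
        sl = srt + extra_latency
    else
        sl, srt = scaling, scaling
    end
    sl, srt
end

# Division (:/) has scaling -2.0; at W = 8 (Wshift = 3) its latency doubles
# and its reciprocal throughput is multiplied by 4:
vector_cost_sketch(13, 4.0, -2.0, 3) # (26, 16.0)
```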
src/graphs.jl

Lines changed: 136 additions & 9 deletions
@@ -1,17 +1,27 @@
 # using LightGraphs

-struct ShortIntVector
-    data::Vector{Int}
+
+"""
+ShortVector{T} simply wraps a Vector{T}, but uses a different hash function that is faster for short vectors, to support using it as the keys of a Dict.
+This hash function scales O(N) with the length of the vector, so it is slow for long vectors.
+"""
+struct ShortVector{T} <: DenseVector{T}
+    data::Vector{T}
 end
-function Base.hash(x::ShortIntVector, h::UInt)
-    d = x.data
-    @inbounds for n ∈ eachindex(d)
-        h = hash(d[n], h)
+Base.@propagate_inbounds Base.getindex(x::ShortVector, I...) = x.data[I...]
+Base.@propagate_inbounds Base.setindex!(x::ShortVector, v, I...) = x.data[I...] = v
+@inbounds Base.length(x::ShortVector) = length(x.data)
+@inbounds Base.size(x::ShortVector) = size(x.data)
+@inbounds Base.strides(x::ShortVector) = strides(x.data)
+@inbounds Base.push!(x::ShortVector, v) = push!(x.data, v)
+@inbounds Base.append!(x::ShortVector, v) = append!(x.data, v)
+function Base.hash(x::ShortVector, h::UInt)
+    @inbounds for n ∈ eachindex(x)
+        h = hash(x[n], h)
     end
     h
 end
-
 @enum NodeType begin
     input
     store
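The custom `hash` lets these short vectors serve as `Dict` keys cheaply; since `ShortVector <: DenseVector`, equality falls back to `AbstractArray`'s elementwise comparison. A hypothetical usage sketch (names invented here), assuming the definitions above are loaded:

```julia
# Hypothetical: loop orders as Dict keys. Two ShortVectors with equal
# contents hash identically, so lookups behave as expected.
order_costs = Dict{ShortVector{Symbol},Float64}()
order_costs[ShortVector([:i, :j, :k])] = 12.5
order_costs[ShortVector([:k, :j, :i])] = 9.0
order_costs[ShortVector([:i, :j, :k])] # 12.5
```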
@@ -27,11 +37,128 @@ struct LoopSet

 end

+function Base.length(ls::LoopSet, is::Symbol)
+
+end
+function variables(ls::LoopSet)
+
+end
+function loopdependencies(var::Variable)
+
+end
+function sym(var::Variable)
+
+end
+function instruction(var::Variable)
+
+end
+function accesses_memory(var::Variable)
+
+end
+function stride(var::Variable, sym::Symbol)
+
+end
+function cost(var::Variable, unrolled::Symbol, dim::Int)
+    c = cost(instruction(var), Wshift, T)::Int
+    if accesses_memory(var) && stride(var, unrolled) != 1
+        c *= W
+    end
+    c
+end
+function Base.eltype(var::Variable)
+    Base._return_type()
+end
+function biggest_type(ls::LoopSet)
+
+end
+
 # evaluates cost of evaluating loop in given order
 function evaluate_cost(
-    ls::LoopSet, order::ShortIntVector
+    ls::LoopSet, order::ShortVector{Symbol}, max_cost = typemax(Int)
 )
-
+    included_vars = Set{Symbol}()
+    nested_loop_syms = Set{Symbol}()
+    total_cost = 0.0
+    iter = 1.0
+    unrolled = last(order)
+    W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, unrolled), biggest_type(ls))::Tuple{Int,Int}
+
+    fused_with_previous = fill(false, length(order))
+    for itersym ∈ order
+        # Add to set of defined symbols
+        push!(nested_loop_syms, itersym)
+        liter = length(ls, itersym)
+        if itersym == unrolled
+            liter /= W
+        end
+        iter *= liter
+        # check which vars we can define at this level of loop nest
+        added_vars = 0
+        for (var,instruction) ∈ variables(ls)
+            # won't define if already defined...
+            sym(var) ∈ included_vars && continue
+            # it must also be a subset of defined symbols
+            loopdependencies(var) ⊆ nested_loop_syms || continue
+            added_vars += 1
+            push!(included_vars, sym(var))
+
+            total_cost += iter * cost(var, W, Wshift, unrolled, liter)
+            total_cost > max_cost && return total_cost # abort
+        end
+        if added_vars == 0
+            # Then it is worth checking if we can fuse with previous
+        end
+    end
+end
+
+struct LoopOrders
+    syms::Vector{Symbol}
+end
+function Base.iterate(lo::LoopOrders)
+    ShortVector(lo.syms), zeros(Int, length(lo.syms))# - 1)
+end
+
+function swap!(x, i, j)
+    xᵢ, xⱼ = x[i], x[j]
+    x[j], x[i] = xᵢ, xⱼ
+end
+function advance_state!(state)
+    N = length(state)
+    for n ∈ 1:N
+        sₙ = state[n]
+        if sₙ == N - n
+            if n == N
+                return false
+            else
+                state[n] = 0
+            end
+        else
+            state[n] = sₙ + 1
+            break
+        end
+    end
+    true
+end
+# I doubt this is the most efficient algorithm, but it's the simplest thing
+# that I could come up with.
+function Base.iterate(lo::LoopOrders, state)
+    advance_state!(state) || return nothing
+    # @show state
+    syms = copy(lo.syms)
+    for i ∈ eachindex(state)
+        sᵢ = state[i]
+        sᵢ == 0 || swap!(syms, i, i + sᵢ)
+    end
+    ShortVector(syms), state
+end
+
+function choose_order(ls::LoopSet)
+    is = copy(itersyms(ls))
+    best_cost = typemax(Int)
+    for lo ∈ LoopOrders(ls)
+        cost = evaluate_cost(ls, lo)
+
+    end
 end

 # Here, we have to figure out how to convert the loopset into a vectorized expression.
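For intuition, `advance_state!` increments a mixed-radix counter whose digit `n` ranges over `0:N-n`, and each state decodes into a permutation via swaps, so the iterator visits all `N!` loop orders. A hypothetical driver, assuming the definitions from this diff:

```julia
# Hypothetical: with three loop symbols the iterator yields all 3! = 6 orders.
for order ∈ LoopOrders([:i, :j, :k])
    println(order.data) # [:i, :j, :k], [:j, :i, :k], [:k, :j, :i], ...
end
```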
