Skip to content

Commit 7bdcfec

Browse files
committed
A few updates.
1 parent a3eddb1 commit 7bdcfec

File tree

5 files changed

+182
-20
lines changed

5 files changed

+182
-20
lines changed

README.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,36 @@ Pkg.add(PackageSpec(url="https://github.com/chriselrod/SIMDPirates.jl"))
1414
Pkg.add(PackageSpec(url="https://github.com/chriselrod/SLEEFPirates.jl"))
1515
Pkg.add(PackageSpec(url="https://github.com/chriselrod/LoopVectorization.jl"))
1616
```
17+
18+
19+
## Usage
20+
21+
The current version of LoopVectorization provides a simple, dumb transform on a single loop.
22+
What I mean by this is that it will not check the transformations for validity. To be safe, I would stick to straight loops that transform arrays or calculate reductions.
23+
24+
For example,
25+
```julia
26+
function sum_simd(x)
27+
s = zero(eltype(x))
28+
@simd for xᵢ ∈ x
29+
s += xᵢ
30+
end
31+
s
32+
end
33+
using LoopVectorization, BenchmarkTools
34+
function sum_loopvec(x::AbstractVector{Float64})
35+
s = 0.0
36+
@vvectorize for i ∈ eachindex(x)
37+
s += x[i]
38+
end
39+
s
40+
end
41+
x = rand(99);
42+
@btime sum($x)
43+
44+
@btime sum_simd($x)
45+
@btime sum_loopvec($x)
46+
```
47+
48+
49+

src/LoopVectorization.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ module LoopVectorization
22

33
using VectorizationBase, SIMDPirates, SLEEFPirates, MacroTools
44
using VectorizationBase: REGISTER_SIZE, extract_data, num_vector_load_expr
5-
using SIMDPirates: VECTOR_SYMBOLS
5+
using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul
66
using MacroTools: @capture, prewalk, postwalk
77

88
export vectorizable, @vectorize, @vvectorize

src/costs.jl

Lines changed: 46 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,49 +9,79 @@
99
struct InstructionCost
1010
scalar_latency::Int
1111
scalar_reciprical_throughput::Float64
12-
scaling::Float64 # sentinel values: -2 == no scaling; -1 == scaling, >0 -> == latency == reciprical throughput
13-
12+
scaling::Float64 # sentinel values: -3 == no scaling; -2 == offset_scaling, -1 == linear scaling, >0 -> == latency == reciprical throughput
13+
register_pressure::Int
1414
end
15-
InstructionCost(sl, srt) = InstructionCost(sl, srt, NoCost)
15+
InstructionCost(sl, srt, scaling = -3.0) = InstructionCost(sl, srt, scaling, 0)
1616

1717
function scalar_cost(instruction::InstructionCost)#, ::Type{T} = Float64) where {T}
1818
instruction.scalar_latency, instruction.scalar_reciprical_throughput
1919
end
2020
function vector_cost(instruction::InstructionCost, Wshift, ::Type{T} = Float64) where {T}
2121
sl, srt = scalar_cost(instruction)
2222
scaling = instruction.scaling
23-
if scaling == NoCost || Wshift == 0
24-
returnsl, srt
25-
elseif scaling == Linear
23+
if scaling == -3.0 || Wshift == 0
24+
return sl, srt
25+
elseif scaling == -2.0
2626
srt *= 1 << (Wshift + VectorizationBase.intlog2(sizeof(T)) - 4)
2727
if (sizeof(T) << Wshift) == VectorizationBase.REGISTER_SIZE # These instructions experience double latency with zmm
2828
sl += sl
2929
end
30-
end
31-
30+
elseif scaling == -1.0
31+
W = 1 << Wshift
32+
extra_latency = sl - srt
33+
srt *= W
34+
sl = srt + extra_latency
35+
else
36+
sl, srt = scaling, scaling
37+
end
3238
sl, srt
3339
end
3440
function cost(instruction::InstructionCost, Wshift, ::Type{T}) where {T}
3541
Wshift == 0 ? scalar_cost(instruction) : vector_cost(instruction, Wshift, T)
3642
end
3743

44+
# Just a semi-reasonable assumption; should not be that sensitive to anything other than loads
45+
const OPAQUE_COST = InstructionSet(50.0, 50.0, -1.0, 32)
46+
3847
const COST = Dict{Symbol,InstructionCost}(
3948
:getindex => InstructionCost(3,0.5),
4049
:setindex! => InstructionCost(3,1.0), # but not a part of dependency chains, so not really twice as expensive?
4150
:+ => InstructionCost(4,0.5),
4251
:- => InstructionCost(4,0.5),
4352
:* => InstructionCost(4,0.5),
44-
:/ => InstructionCost(13,4.0,),
53+
:/ => InstructionCost(13,4.0,-2.0),
54+
:inv => InstructionCost(13,4.0,-2.0,1),
4555
:muladd => InstructionCost(0.5,4), # + and * will fuse into this, so much of the time they're not twice as expensive
46-
:sqrt => InstructionCost(),
47-
:log => InstructionCost(,,52.5),
48-
:exp => InstructionCost(,,30.0),
49-
:sin => InstructionCost(),
50-
:cos => InstructionCost(),
51-
:sincos => InstructionCost(),
52-
:
56+
:sqrt => InstructionCost(15,4.0,-2.0),
57+
:log => InstructionCost(20,20.0,40.0,20),
58+
:exp => InstructionCost(20,20.0,20.0,18),
59+
:sin => InstructionCost(18,15.0,68.0,23),
60+
:cos => InstructionCost(18,15.0,68.0,26),
61+
:sincos => InstructionCost(25,22.0,70.0,26)
5362
)
5463

64+
function sum_simd(x)
65+
s = zero(eltype(x))
66+
@simd for xᵢ x
67+
s += xᵢ
68+
end
69+
s
70+
end
71+
using LoopVectorization, BenchmarkTools
72+
function sum_loopvec(x::AbstractVector{Float64})
73+
s = 0.0
74+
@vvectorize 4 for i eachindex(x)
75+
s += x[i]
76+
end
77+
s
78+
end
79+
x = rand(99);
80+
@btime sum($x)
81+
@btime sum_simd($x)
82+
@btime sum_loopvec($x)
83+
84+
5585
# const SIMDPIRATES_COST = Dict{Symbol,InstructionCost}()
5686
# const SLEEFPIRATES_COST = Dict{Symbol,InstructionCost}()
5787

src/graphs.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ end
3131
function evaluate_cost(
3232
ls::LoopSet, order::ShortIntVector
3333
)
34-
34+
included_vars = Set{Symbol}
3535
end
3636

3737
# Here, we have to figure out how to convert the loopset into a vectorized expression.

test/runtests.jl

Lines changed: 101 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,100 @@ end
224224
(ts_end - ts_start) / (2N*K), vsum(s_1)
225225
end
226226
end
227+
228+
229+
230+
@generated function estimate_cost_onearg_llvmunroll(f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)) where {F,T,U}
231+
W, Wshift = VectorizationBase.pick_vector_width_shift(T)
232+
Ushift = VectorizationBase.intlog2(U)
233+
W <<= Ushift
234+
quote
235+
s = vbroadcast(Vec{$W,$T}, zero(T))
236+
x = rand(T, N << $Wshift)
237+
ptrx = pointer(x)
238+
ts_start, id_start = cpucycle_id()
239+
for k 1:K
240+
_ptrx = ptrx
241+
for n 1:N>>$Ushift
242+
v = vload(Vec{$W,$T}, _ptrx)
243+
s = vadd(s, f(v))
244+
_ptrx += VectorizationBase.REGISTER_SIZE*$U
245+
end
246+
end
247+
ts_end, id_end = cpucycle_id()
248+
@assert id_start == id_end
249+
(ts_end - ts_start) / (N*K), vsum(s)
250+
end
251+
end
252+
@generated function estimate_cost_onearg_tworet_llvmunroll(f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)) where {F,T,U}
253+
W, Wshift = VectorizationBase.pick_vector_width_shift(T)
254+
Ushift = VectorizationBase.intlog2(U)
255+
W <<= Ushift
256+
quote
257+
s = vbroadcast(Vec{$W,$T}, zero(T))
258+
x = rand(T, N << $Wshift)
259+
ptrx = pointer(x)
260+
ts_start, id_start = cpucycle_id()
261+
for k 1:K
262+
_ptrx = ptrx
263+
for n 1:N>>$Ushift
264+
v = vload(Vec{$W,$T}, _ptrx)
265+
a, b = f(v)
266+
s = vmuladd(a, b, s)
267+
_ptrx += VectorizationBase.REGISTER_SIZE*$U
268+
end
269+
end
270+
ts_end, id_end = cpucycle_id()
271+
@assert id_start == id_end
272+
(ts_end - ts_start) / (N*K), vsum(s)
273+
end
274+
end
275+
@generated function estimate_cost_twoarg_llvmunroll(f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)) where {F,T,U}
276+
W, Wshift = VectorizationBase.pick_vector_width_shift(T)
277+
Ushift = VectorizationBase.intlog2(U)
278+
W <<= Ushift
279+
quote
280+
s = vbroadcast(Vec{$W,$T}, one(T))
281+
x = rand(T, N << $Wshift)
282+
ptrx = pointer(x)
283+
ts_start, id_start = cpucycle_id()
284+
for k 1:K
285+
_ptrx = ptrx
286+
for n 1:N>>$Ushift
287+
v = vload(Vec{$W,$T}, _ptrx)
288+
s = f(v, s)
289+
_ptrx += VectorizationBase.REGISTER_SIZE*$U
290+
end
291+
end
292+
ts_end, id_end = cpucycle_id()
293+
@assert id_start == id_end
294+
(ts_end - ts_start) / (N*K), vsum(s)
295+
end
296+
end
297+
@generated function estimate_cost_threearg_llvmunroll(f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)) where {F,T,U}
298+
W, Wshift = VectorizationBase.pick_vector_width_shift(T)
299+
Ushift = VectorizationBase.intlog2(U)
300+
W <<= Ushift
301+
quote
302+
s = vbroadcast(Vec{$W,$T}, zero(T))
303+
x = rand(T, N << $Wshift)
304+
ptrx = pointer(x)
305+
ts_start, id_start = cpucycle_id()
306+
for k 1:K
307+
_ptrx = ptrx
308+
for n 1:N>>$Ushift
309+
v = vload(Vec{$W,$T}, _ptrx)
310+
s = f(v, v, s)
311+
_ptrx += VectorizationBase.REGISTER_SIZE*$U
312+
end
313+
end
314+
ts_end, id_end = cpucycle_id()
315+
@assert id_start == id_end
316+
(ts_end - ts_start) / (N*K), vsum(s)
317+
end
318+
end
319+
320+
227321
estimate_cost_onearg_serial(exp, 512, 1_000, Float64, Val(1)) # 21
228322
estimate_cost_onearg_serial(exp, 512, 1_000, Float64, Val(2)) # 18.4
229323
estimate_cost_onearg_serial(exp, 512, 1_000, Float64, Val(4)) # 17.5
@@ -254,10 +348,15 @@ estimate_cost_onearg_tworet_serial(sincos, 512, 1_000, Float64, Val(2)) # 23
254348
estimate_cost_onearg_tworet_serial(sincos, 512, 1_000, Float64, Val(4)) # 22
255349

256350

257-
estimate_cost_onearg(SLEEFPirates.exp, 512, 1_000, Float64, Val(1)) # 28 # 21
258-
estimate_cost_onearg(SLEEFPirates.exp, 512, 1_000, Float64, Val(2)) # 28 # 20
351+
estimate_cost_onearg(SLEEFPirates.exp, 512, 1_000, Float64, Val(1)) # 48 # 21
352+
estimate_cost_onearg(SLEEFPirates.exp, 512, 1_000, Float64, Val(2)) # 52 # 20
259353
estimate_cost_onearg(SLEEFPirates.exp, 512, 1_000, Float64, Val(4)) # 28 # 19.5
260354

355+
estimate_cost_onearg_llvmunroll(SLEEFPirates.exp, 512, 1_000, Float64, Val(1)) # 50 # 21
356+
estimate_cost_onearg_llvmunroll(SLEEFPirates.exp, 512, 1_000, Float64, Val(2)) # 40 # 20
357+
# estimate_cost_onearg_llvmunroll(SLEEFPirates.exp, 512, 1_000, Float64, Val(3)) # 40 #
358+
estimate_cost_onearg_llvmunroll(SLEEFPirates.exp, 512, 1_000, Float64, Val(4)) # 32 # 19.5
359+
261360
estimate_cost_onearg(SLEEFPirates.log, 512, 1_000, Float64, Val(1)) # 51 cycles # 44
262361
estimate_cost_onearg(SLEEFPirates.log, 512, 1_000, Float64, Val(2)) # 51 cycles # 40
263362
estimate_cost_onearg(SLEEFPirates.log, 512, 1_000, Float64, Val(4)) # 51 cycles # 39

0 commit comments

Comments
 (0)