Skip to content

Commit 0cdd1cb

Browse files
committed
Worked on a few more cost tests.
1 parent 4cf6786 commit 0cdd1cb

File tree

2 files changed

+286
-18
lines changed

2 files changed

+286
-18
lines changed

src/LoopVectorization.jl

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,11 @@ end
133133
throw("Type $Tsym is not supported.")
134134
end
135135
end
136-
@noinline function vectorize_body(N, T::DataType, unroll_factor::Int, n, body, vecdict = SLEEFPiratesDict, VType = SVec, gcpreserve::Bool = true, mod = :LoopVectorization)
136+
@noinline function vectorize_body(
137+
N, ::Type{T}, unroll_factor::Int, n::Symbol, body::Array{Any},
138+
vecdict::Dict{Symbol,Tuple{Symbol,Symbol}} = SLEEFPiratesDict,
139+
@nospecialize(VType = SVec), gcpreserve::Bool = true, mod = :LoopVectorization
140+
) where {T}
137141
# unroll_factor == 1 || throw("Only unroll factor of 1 is currently supported. Was set to $unroll_factor.")
138142
T_size = sizeof(T)
139143
if isa(N, Integer)
@@ -295,7 +299,7 @@ end
295299
pushfirst!(q.args, :($gsym = $mod.vbroadcast($V,one($T))))
296300
end
297301
end
298-
func = ((op == :*) | (op == :/)) ? :($mod.vmul) : :($mod.vadd)
302+
func = ((op == :*) | (op == :/)) ? :($mod.evmul) : :($mod.evadd)
299303
uf_new = unroll_factor
300304
while uf_new > 1
301305
uf_new, uf_prev = uf_new >> 1, uf_new

test/runtests.jl

Lines changed: 280 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,56 @@
11
using LoopVectorization
22
using Test
33

4-
using CpuId, VectorizationBase, SIMDPirates, SLEEFPirates
5-
@generated function estimate_cost(f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)) where {F,T,U}
4+
using CpuId, VectorizationBase, SIMDPirates, SLEEFPirates, VectorizedRNG
5+
6+
"""
    estimate_cost_onearg_serial(f, N = 512, K = 1_000, T = Float64, Val(U) = Val(4))

Time the scalar one-argument function `f` on the elements of a freshly drawn
`rand(T, N)` vector, sweeping the vector `K` times.

The inner loop body is unrolled `U` times, each copy feeding its own accumulator
(`s_1`, ..., `s_U`), and runs `N >> intlog2(U)` times per sweep.
NOTE(review): this presumably covers exactly `N` elements only when `U` is a power
of two; confirm before using other unroll factors, since the loop is `@inbounds`.

# Returns
A tuple `(cost, s)` where `cost` is the difference between the two `cpucycle_id()`
time stamps divided by `N * K`, and `s` is the sum of all accumulators.

# Throws
`AssertionError` if the id reported by `cpucycle_id()` differs between the start and
the end of the timed region.
"""
@generated function estimate_cost_onearg_serial(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    quote
        # One independent accumulator per unrolled copy of the loop body.
        Base.Cartesian.@nexprs $U u -> s_u = zero(T)
        x = rand(T, N)
        ts_start, id_start = cpucycle_id()
        @inbounds for k in 1:K
            i = 1
            for n in 1:N>>$(VectorizationBase.intlog2(U))
                Base.Cartesian.@nexprs $U u -> begin
                    v_u = x[i]
                    i += 1
                    s_u += f(v_u)
                end
            end
        end
        ts_end, id_end = cpucycle_id()
        # Both time stamps must carry the same id for the difference to be used.
        @assert id_start == id_end
        # Fold the remaining accumulators into s_1.
        Base.Cartesian.@nexprs $(U-1) u -> s_1 += s_{u+1}
        (ts_end - ts_start) / (N*K), s_1
    end
end
29+
"""
    estimate_cost_onearg_tworet_serial(f, N = 512, K = 1_000, T = Float64, Val(U) = Val(4))

Time a scalar one-argument function `f` that returns two values (for example
`sincos`) on the elements of a freshly drawn `rand(T, N)` vector, sweeping the
vector `K` times.

Each call's two results `a, b` are folded into an accumulator with
`muladd(a, b, s)`. The inner loop body is unrolled `U` times with one accumulator
per copy and runs `N >> intlog2(U)` times per sweep.
NOTE(review): this presumably covers exactly `N` elements only when `U` is a power
of two; confirm before using other unroll factors, since the loop is `@inbounds`.

# Returns
A tuple `(cost, s)` where `cost` is the difference between the two `cpucycle_id()`
time stamps divided by `N * K`, and `s` is the sum of all accumulators.

# Throws
`AssertionError` if the id reported by `cpucycle_id()` differs between the start and
the end of the timed region.
"""
@generated function estimate_cost_onearg_tworet_serial(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    quote
        # One independent accumulator per unrolled copy of the loop body.
        Base.Cartesian.@nexprs $U u -> s_u = zero(T)
        x = rand(T, N)
        ts_start, id_start = cpucycle_id()
        @inbounds for k in 1:K
            i = 1
            for n in 1:N>>$(VectorizationBase.intlog2(U))
                Base.Cartesian.@nexprs $U u -> begin
                    v_u = x[i]
                    i += 1
                    a_u, b_u = f(v_u)
                    s_u = muladd(a_u, b_u, s_u)
                end
            end
        end
        ts_end, id_end = cpucycle_id()
        # Both time stamps must carry the same id for the difference to be used.
        @assert id_start == id_end
        # Fold the remaining accumulators into s_1.
        Base.Cartesian.@nexprs $(U-1) u -> s_1 += s_{u+1}
        (ts_end - ts_start) / (N*K), s_1
    end
end
53+
@generated function estimate_cost_onearg(f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)) where {F,T,U}
654
W, Wshift = VectorizationBase.pick_vector_width_shift(T)
755
quote
856
Base.Cartesian.@nexprs $U u -> s_u = vbroadcast(Vec{$W,$T}, zero(T))
@@ -29,23 +77,239 @@ using CpuId, VectorizationBase, SIMDPirates, SLEEFPirates
2977
(ts_end - ts_start) / (N*K), vsum(s_1)
3078
end
3179
end
32-
estimate_cost(SLEEFPirates.exp, 512, 1_000, Float64, Val(4)) # 28
33-
34-
estimate_cost(SLEEFPirates.log, 512, 1_000, Float64, Val(1)) # 51 cycles
35-
estimate_cost(SLEEFPirates.log, 512, 1_000, Float64, Val(2)) # 51 cycles
36-
estimate_cost(SLEEFPirates.log, 512, 1_000, Float64, Val(4)) # 51 cycles
37-
estimate_cost(SIMDPirates.vsqrt, 512, 1_000, Float64, Val(1)) # 23 cycles
38-
estimate_cost(SIMDPirates.vsqrt, 512, 1_000, Float64, Val(2)) # 23 cycles
39-
estimate_cost(SIMDPirates.vsqrt, 512, 1_000, Float64, Val(4)) # 23 cycles
40-
estimate_cost(SIMDPirates.vinv, 512, 1_000, Float64, Val(1)) # 23 cycles
41-
estimate_cost(SIMDPirates.vinv, 512, 1_000, Float64, Val(2)) # 23 cycles
42-
estimate_cost(SIMDPirates.vinv, 512, 1_000, Float64, Val(4)) # 23 cycles
80+
"""
    estimate_cost_onearg_tworet(f, N = 512, K = 1_000, T = Float64, Val(U) = Val(4))

Time a vectorized one-argument function `f` that returns two vectors (for example
`SLEEFPirates.sincos`), sweeping `K` times over a freshly drawn
`rand(T, N << Wshift)` buffer.

`W` and `Wshift` come from `VectorizationBase.pick_vector_width_shift(T)`. Vectors
are read with `vload(Vec{W,T}, ptr)` and the pointer advances by
`VectorizationBase.REGISTER_SIZE` after every load. Each call's results `a, b` are
folded into an accumulator with `vmuladd(a, b, s)`. The inner loop body is unrolled
`U` times with one accumulator per copy and runs `N >> intlog2(U)` times per sweep.
NOTE(review): the loads presumably stay inside the buffer only when `U` is a power
of two and `REGISTER_SIZE == W * sizeof(T)`; confirm before changing either.

# Returns
A tuple `(cost, s)` where `cost` is the difference between the two `cpucycle_id()`
time stamps divided by `N * K`, and `s` is `vsum` of the combined accumulators.

# Throws
`AssertionError` if the id reported by `cpucycle_id()` differs between the start and
the end of the timed region.
"""
@generated function estimate_cost_onearg_tworet(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
    quote
        # One independent vector accumulator per unrolled copy of the loop body.
        Base.Cartesian.@nexprs $U u -> s_u = vbroadcast(Vec{$W,$T}, zero(T))
        x = rand(T, N << $Wshift)
        ptrx = pointer(x)
        ts_start, id_start = cpucycle_id()
        # `x` is only reached through raw pointers from here on, so it must be kept
        # rooted explicitly while the loads are running.
        GC.@preserve x begin
            for k in 1:K
                _ptrx = ptrx
                for n in 1:N>>$(VectorizationBase.intlog2(U))
                    Base.Cartesian.@nexprs $U u -> begin
                        v_u = vload(Vec{$W,$T}, _ptrx)
                        a_u, b_u = f(v_u)
                        s_u = vmuladd(a_u, b_u, s_u)
                        _ptrx += VectorizationBase.REGISTER_SIZE
                    end
                end
            end
        end
        ts_end, id_end = cpucycle_id()
        # Both time stamps must carry the same id for the difference to be used.
        @assert id_start == id_end
        # Fold the remaining accumulators into s_1.
        Base.Cartesian.@nexprs $(U-1) u -> s_1 = vadd(s_1, s_{u+1})
        (ts_end - ts_start) / (N*K), vsum(s_1)
    end
end
108+
"""
    estimate_cost_twoarg(f, N = 512, K = 1_000, T = Float64, Val(U) = Val(4))

Time a vectorized two-argument function `f` (for example `SIMDPirates.vmul`) by
repeatedly updating accumulators as `s = f(s, v)`, sweeping `K` times over a freshly
drawn `rand(T, N << Wshift)` buffer.

`W` and `Wshift` come from `VectorizationBase.pick_vector_width_shift(T)`. Vectors
are read with `vload(Vec{W,T}, ptr)` and the pointer advances by
`VectorizationBase.REGISTER_SIZE` after every load.

Two code paths are generated:

- `U == 1`: a single accumulator, initialised to ones, is updated once per loaded
  vector. The time-stamp difference is divided by `N * K`.
- otherwise: `2U` accumulators are initialised with
  `randn(VectorizedRNG.GLOBAL_vPCG, Vec{W,T})`. With `Uh = U >>> 1`, every unrolled
  step loads two vectors and issues four calls to `f`, so each loaded vector is used
  twice. For even `U` that is `2U` calls per inner iteration, which is why the
  time-stamp difference is divided by `2N * K`.

NOTE(review): the inner trip count is `N >> intlog2(U)`; loads presumably stay inside
the buffer only when `U` is a power of two and `REGISTER_SIZE == W * sizeof(T)`.
Confirm before changing either.

# Returns
A tuple `(cost, s)` where `cost` is the normalised time-stamp difference described
above and `s` is `vsum` of the combined accumulators.

# Throws
`AssertionError` if the id reported by `cpucycle_id()` differs between the start and
the end of the timed region.
"""
@generated function estimate_cost_twoarg(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
    if U == 1
        return quote
            Base.Cartesian.@nexprs $U u -> s_u = vbroadcast(Vec{$W,$T}, one(T))
            x = rand(T, N << $Wshift)
            ptrx = pointer(x)
            ts_start, id_start = cpucycle_id()
            # `x` is only reached through raw pointers from here on, so it must be
            # kept rooted explicitly while the loads are running.
            GC.@preserve x begin
                for k in 1:K
                    _ptrx = ptrx
                    for n in 1:N>>$(VectorizationBase.intlog2(U))
                        Base.Cartesian.@nexprs $U u -> begin
                            v_u = vload(Vec{$W,$T}, _ptrx)
                            s_u = f(s_u, v_u)
                            _ptrx += VectorizationBase.REGISTER_SIZE
                        end
                    end
                end
            end
            ts_end, id_end = cpucycle_id()
            # Both time stamps must carry the same id for the difference to be used.
            @assert id_start == id_end
            Base.Cartesian.@nexprs $(U-1) u -> s_1 = vadd(s_1, s_{u+1})
            (ts_end - ts_start) / (N*K), vsum(s_1)
        end
    end
    Uh = U >>> 1
    quote
        # 2U independent accumulators, started from random normal vectors.
        Base.Cartesian.@nexprs $(U << 1) u -> s_u = randn(VectorizedRNG.GLOBAL_vPCG, Vec{$W,$T})
        x = rand(T, N << $Wshift)
        ptrx = pointer(x)
        ts_start, id_start = cpucycle_id()
        # `x` is only reached through raw pointers from here on, so it must be kept
        # rooted explicitly while the loads are running.
        GC.@preserve x begin
            for k in 1:K
                _ptrx = ptrx
                for n in 1:N>>$(VectorizationBase.intlog2(U))
                    Base.Cartesian.@nexprs $Uh u -> begin
                        v_u = vload(Vec{$W,$T}, _ptrx)
                        _ptrx += VectorizationBase.REGISTER_SIZE
                        v_{u+$Uh} = vload(Vec{$W,$T}, _ptrx)
                        _ptrx += VectorizationBase.REGISTER_SIZE
                        # Each loaded vector feeds two different accumulators.
                        s_u = f(s_u, v_u)
                        s_{u+$Uh} = f(s_{u+$Uh}, v_{u+$Uh})
                        s_{u+$U} = f(s_{u+$U}, v_u)
                        s_{u+$(Uh+U)} = f(s_{u+$(Uh+U)}, v_{u+$Uh})
                    end
                end
            end
        end
        ts_end, id_end = cpucycle_id()
        # Both time stamps must carry the same id for the difference to be used.
        @assert id_start == id_end
        # Fold the remaining accumulators into s_1.
        Base.Cartesian.@nexprs $((U<<1)-1) u -> s_1 = vadd(s_1, s_{u+1})
        (ts_end - ts_start) / (2N*K), vsum(s_1)
    end
end
168+
"""
    estimate_cost_threearg(f, N = 512, K = 1_000, T = Float64, Val(U) = Val(4))

Time a vectorized three-argument function `f` (for example `SIMDPirates.vmuladd`) by
repeatedly updating accumulators as `s = f(v, w, s)`, sweeping `K` times over a
freshly drawn `rand(T, N << Wshift)` buffer.

`W` and `Wshift` come from `VectorizationBase.pick_vector_width_shift(T)`. Vectors
are read with `vload(Vec{W,T}, ptr)` and the pointer advances by
`VectorizationBase.REGISTER_SIZE` after every load. All accumulators start at zero.

Two code paths are generated:

- `U == 1`: a single accumulator is updated as `f(v, v, s)` once per loaded vector.
  The time-stamp difference is divided by `N * K`.
- otherwise: `2U` accumulators are used. With `Uh = U >>> 1`, every unrolled step
  loads two vectors `v` and `w` and issues four calls: `f(v, v, s)`, `f(w, w, s)`
  and two `f(v, w, s)` calls on distinct accumulators. For even `U` that is `2U`
  calls per inner iteration, which is why the time-stamp difference is divided by
  `2N * K`.

NOTE(review): the inner trip count is `N >> intlog2(U)`; loads presumably stay inside
the buffer only when `U` is a power of two and `REGISTER_SIZE == W * sizeof(T)`.
Confirm before changing either.

# Returns
A tuple `(cost, s)` where `cost` is the normalised time-stamp difference described
above and `s` is `vsum` of the combined accumulators.

# Throws
`AssertionError` if the id reported by `cpucycle_id()` differs between the start and
the end of the timed region.
"""
@generated function estimate_cost_threearg(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
    if U == 1
        return quote
            Base.Cartesian.@nexprs $U u -> s_u = vbroadcast(Vec{$W,$T}, zero(T))
            x = rand(T, N << $Wshift)
            ptrx = pointer(x)
            ts_start, id_start = cpucycle_id()
            # `x` is only reached through raw pointers from here on, so it must be
            # kept rooted explicitly while the loads are running.
            GC.@preserve x begin
                for k in 1:K
                    _ptrx = ptrx
                    for n in 1:N>>$(VectorizationBase.intlog2(U))
                        Base.Cartesian.@nexprs $U u -> begin
                            v_u = vload(Vec{$W,$T}, _ptrx)
                            s_u = f(v_u, v_u, s_u)
                            _ptrx += VectorizationBase.REGISTER_SIZE
                        end
                    end
                end
            end
            ts_end, id_end = cpucycle_id()
            # Both time stamps must carry the same id for the difference to be used.
            @assert id_start == id_end
            Base.Cartesian.@nexprs $(U-1) u -> s_1 = vadd(s_1, s_{u+1})
            (ts_end - ts_start) / (N*K), vsum(s_1)
        end
    end
    Uh = U >>> 1
    quote
        # 2U independent accumulators, all started from zero.
        Base.Cartesian.@nexprs $(U<<1) u -> s_u = vbroadcast(Vec{$W,$T}, zero(T))
        x = rand(T, N << $Wshift)
        ptrx = pointer(x)
        ts_start, id_start = cpucycle_id()
        # `x` is only reached through raw pointers from here on, so it must be kept
        # rooted explicitly while the loads are running.
        GC.@preserve x begin
            for k in 1:K
                _ptrx = ptrx
                for n in 1:N>>$(VectorizationBase.intlog2(U))
                    Base.Cartesian.@nexprs $Uh u -> begin
                        v_u = vload(Vec{$W,$T}, _ptrx)
                        _ptrx += VectorizationBase.REGISTER_SIZE
                        v_{u+$Uh} = vload(Vec{$W,$T}, _ptrx)
                        _ptrx += VectorizationBase.REGISTER_SIZE
                        # Four calls per pair of loads, each on its own accumulator.
                        s_u = f(v_u, v_u, s_u)
                        s_{u+$Uh} = f(v_{u+$Uh}, v_{u+$Uh}, s_{u+$Uh})
                        s_{u+$U} = f(v_u, v_{u+$Uh}, s_{u+$U})
                        s_{u+$(Uh+U)} = f(v_u, v_{u+$Uh}, s_{u+$(Uh+U)})
                    end
                end
            end
        end
        ts_end, id_end = cpucycle_id()
        # Both time stamps must carry the same id for the difference to be used.
        @assert id_start == id_end
        # Fold the remaining accumulators into s_1.
        Base.Cartesian.@nexprs $((U<<1) - 1) u -> s_1 = vadd(s_1, s_{u+1})
        (ts_end - ts_start) / (2N*K), vsum(s_1)
    end
end
227+
# Scalar baselines: Base math functions run through the serial estimators with
# N = 512, K = 1_000, Float64, at unroll factors 1, 2 and 4 (sqrt variants are also
# dumped with @code_native). The results are not stored or tested.
# NOTE(review): the trailing numbers look like the author's recorded values of the
# first returned element (time-stamp difference per evaluation); confirm.
estimate_cost_onearg_serial(exp, 512, 1_000, Float64, Val(1)) # 21
228+
estimate_cost_onearg_serial(exp, 512, 1_000, Float64, Val(2)) # 18.4
229+
estimate_cost_onearg_serial(exp, 512, 1_000, Float64, Val(4)) # 17.5
230+
231+
estimate_cost_onearg_serial(log, 512, 1_000, Float64, Val(1)) # 22
232+
estimate_cost_onearg_serial(log, 512, 1_000, Float64, Val(2)) # 19
233+
estimate_cost_onearg_serial(log, 512, 1_000, Float64, Val(4)) # 19
234+
235+
estimate_cost_onearg_serial(Base.FastMath.sqrt_fast, 512, 1_000, Float64, Val(1)) # 5
236+
estimate_cost_onearg_serial(Base.FastMath.sqrt_fast, 512, 1_000, Float64, Val(2)) # 2.5 # SIMD
237+
estimate_cost_onearg_serial(Base.FastMath.sqrt_fast, 512, 1_000, Float64, Val(4)) # 1.25 # SIMD
238+
@code_native debuginfo=:none estimate_cost_onearg_serial(Base.FastMath.sqrt_fast, 512, 1_000, Float64, Val(4)) # 1.25
239+
estimate_cost_onearg_serial(sqrt, 512, 1_000, Float64, Val(1)) # 5
240+
estimate_cost_onearg_serial(sqrt, 512, 1_000, Float64, Val(2)) # 2.5 # SIMD
241+
estimate_cost_onearg_serial(sqrt, 512, 1_000, Float64, Val(4)) # 1.25 # SIMD
242+
@code_native debuginfo=:none estimate_cost_onearg_serial(sqrt, 512, 1_000, Float64, Val(4)) # 1.25
243+
244+
estimate_cost_onearg_serial(sin, 512, 1_000, Float64, Val(1)) # 18
245+
estimate_cost_onearg_serial(sin, 512, 1_000, Float64, Val(2)) # 15
246+
estimate_cost_onearg_serial(sin, 512, 1_000, Float64, Val(4)) # 15
247+
248+
estimate_cost_onearg_serial(cos, 512, 1_000, Float64, Val(1)) # 19
249+
estimate_cost_onearg_serial(cos, 512, 1_000, Float64, Val(2)) # 16
250+
estimate_cost_onearg_serial(cos, 512, 1_000, Float64, Val(4)) # 16
251+
252+
estimate_cost_onearg_tworet_serial(sincos, 512, 1_000, Float64, Val(1)) # 25
253+
estimate_cost_onearg_tworet_serial(sincos, 512, 1_000, Float64, Val(2)) # 23
254+
estimate_cost_onearg_tworet_serial(sincos, 512, 1_000, Float64, Val(4)) # 22
255+
256+
257+
# Vectorized counterparts: the same functions taken from SLEEFPirates / SIMDPirates
# and run through the vector estimators with the same N, K, element type and unroll
# factors.
# NOTE(review): where two numbers trail a call, the first presumably is an older
# measurement and the second the newer one; confirm.
estimate_cost_onearg(SLEEFPirates.exp, 512, 1_000, Float64, Val(1)) # 28 # 21
258+
estimate_cost_onearg(SLEEFPirates.exp, 512, 1_000, Float64, Val(2)) # 28 # 20
259+
estimate_cost_onearg(SLEEFPirates.exp, 512, 1_000, Float64, Val(4)) # 28 # 19.5
260+
261+
estimate_cost_onearg(SLEEFPirates.log, 512, 1_000, Float64, Val(1)) # 51 cycles # 44
262+
estimate_cost_onearg(SLEEFPirates.log, 512, 1_000, Float64, Val(2)) # 51 cycles # 40
263+
estimate_cost_onearg(SLEEFPirates.log, 512, 1_000, Float64, Val(4)) # 51 cycles # 39
264+
265+
estimate_cost_onearg(SIMDPirates.vsqrt, 512, 1_000, Float64, Val(1)) # 23 cycles # 20
266+
estimate_cost_onearg(SIMDPirates.vsqrt, 512, 1_000, Float64, Val(2)) # 23 cycles # 20
267+
estimate_cost_onearg(SIMDPirates.vsqrt, 512, 1_000, Float64, Val(4)) # 23 cycles # 20
268+
269+
estimate_cost_onearg(SIMDPirates.vinv, 512, 1_000, Float64, Val(1)) # 23 cycles # 13.4
270+
estimate_cost_onearg(SIMDPirates.vinv, 512, 1_000, Float64, Val(2)) # 23 cycles # 13.4
271+
estimate_cost_onearg(SIMDPirates.vinv, 512, 1_000, Float64, Val(4)) # 23 cycles # 13.4
272+
273+
estimate_cost_onearg(SLEEFPirates.sin, 512, 1_000, Float64, Val(1)) # cycles # 68
274+
estimate_cost_onearg(SLEEFPirates.sin, 512, 1_000, Float64, Val(2)) # cycles # 66
275+
estimate_cost_onearg(SLEEFPirates.sin, 512, 1_000, Float64, Val(4)) # cycles # 66
276+
277+
estimate_cost_onearg(SLEEFPirates.cos, 512, 1_000, Float64, Val(1)) # cycles # 65
278+
estimate_cost_onearg(SLEEFPirates.cos, 512, 1_000, Float64, Val(2)) # cycles # 68
279+
estimate_cost_onearg(SLEEFPirates.cos, 512, 1_000, Float64, Val(4)) # cycles # 66
280+
281+
estimate_cost_onearg_tworet(SLEEFPirates.sincos, 512, 1_000, Float64, Val(1)) # cycles # 71
282+
estimate_cost_onearg_tworet(SLEEFPirates.sincos, 512, 1_000, Float64, Val(2)) # cycles # 71
283+
estimate_cost_onearg_tworet(SLEEFPirates.sincos, 512, 1_000, Float64, Val(4)) # cycles # 68
43284

44285
# Constant 4-tuple of `Core.VecElement`s, each holding one `randn()` draw. It is
# captured by the `x -> SIMDPirates.vmul(x,cz)` closures in this file as the fixed
# second operand.
# NOTE(review): the length is hard-coded to 4; presumably it has to match the vector
# width picked for Float64 on the benchmarking machine. Confirm.
const cz = ntuple(Val(4)) do i Core.VecElement(randn()) end
45286
# @code_native debuginfo=:none
46-
estimate_cost(x -> SIMDPirates.vmul(x,cz), 1<<9, 10^3, Float64, Val(1)) # 4.5 cycles
47-
estimate_cost(x -> SIMDPirates.vmul(x,cz), 1<<9, 10^3, Float64, Val(2)) # 2 cycles
48-
estimate_cost(x -> SIMDPirates.vmul(x,cz), 1<<9, 10^3, Float64, Val(4)) # 1 cycles
287+
# Multiplication by the constant `cz` through the one-argument estimator, at unroll
# factors 1, 2, 4 and 8. The results are not stored or tested.
# NOTE(review): the trailing numbers look like the author's recorded measurements
# (older value first where two are present); confirm.
estimate_cost_onearg(x -> SIMDPirates.vmul(x,cz), 1<<9, 10^3, Float64, Val(1)) # 4.5 cycles # 3.35
288+
estimate_cost_onearg(x -> SIMDPirates.vmul(x,cz), 1<<9, 10^3, Float64, Val(2)) # 2 cycles # 1.66
289+
estimate_cost_onearg(x -> SIMDPirates.vmul(x,cz), 1<<9, 10^3, Float64, Val(4)) # 1 cycles # 1
290+
estimate_cost_onearg(x -> SIMDPirates.vmul(x,cz), 1<<9, 10^3, Float64, Val(8)) # cycles # 0.65
291+
292+
# Two-argument estimator applied to vmul / vadd at unroll factors 1, 2, 4 and 8,
# plus the evmul / evadd variants at unroll factor 8 only.
estimate_cost_twoarg(SIMDPirates.vmul, 1<<9, 10^3, Float64, Val(1)) # cycles # 3.3
293+
estimate_cost_twoarg(SIMDPirates.vmul, 1<<9, 10^3, Float64, Val(2)) # cycles # 0.97
294+
estimate_cost_twoarg(SIMDPirates.vmul, 1<<9, 10^3, Float64, Val(4)) # cycles # 0.52
295+
estimate_cost_twoarg(SIMDPirates.vmul, 1<<9, 10^3, Float64, Val(8)) # cycles # 0.51
296+
estimate_cost_twoarg(SIMDPirates.evmul, 1<<9, 10^3, Float64, Val(8)) # cycles # 0.51
297+
estimate_cost_twoarg(SIMDPirates.vadd, 1<<9, 10^3, Float64, Val(1)) # cycles # 3.3
298+
estimate_cost_twoarg(SIMDPirates.vadd, 1<<9, 10^3, Float64, Val(2)) # cycles # 0.97
299+
estimate_cost_twoarg(SIMDPirates.vadd, 1<<9, 10^3, Float64, Val(4)) # cycles # 0.52
300+
estimate_cost_twoarg(SIMDPirates.vadd, 1<<9, 10^3, Float64, Val(8)) # cycles # 0.5
301+
estimate_cost_twoarg(SIMDPirates.evadd, 1<<9, 10^3, Float64, Val(8)) # cycles # 0.5
302+
303+
# Native-code dumps of the unroll-factor-8 vmul and evmul instantiations.
@code_native debuginfo=:none estimate_cost_twoarg(SIMDPirates.vmul, 1<<9, 10^3, Float64, Val(8)) # cycles # 0.64
304+
@code_native debuginfo=:none estimate_cost_twoarg(SIMDPirates.evmul, 1<<9, 10^3, Float64, Val(8)) # cycles # 0.64
305+
306+
307+
# Three-argument estimator applied to vmuladd at unroll factors 1, 2, 4 and 8,
# followed by native-code dumps for vmuladd and vfmadd at unroll factor 8.
estimate_cost_threearg(SIMDPirates.vmuladd, 1<<9, 10^3, Float64, Val(1)) # cycles # 3.3
308+
estimate_cost_threearg(SIMDPirates.vmuladd, 1<<9, 10^3, Float64, Val(2)) # cycles # 0.99
309+
estimate_cost_threearg(SIMDPirates.vmuladd, 1<<9, 10^3, Float64, Val(4)) # cycles # 0.54
310+
estimate_cost_threearg(SIMDPirates.vmuladd, 1<<9, 10^3, Float64, Val(8)) # cycles # 0.533
311+
@code_native debuginfo=:none estimate_cost_threearg(SIMDPirates.vmuladd, 1<<9, 10^3, Float64, Val(8)) # cycles # 0.537
312+
@code_native debuginfo=:none estimate_cost_threearg(SIMDPirates.vfmadd, 1<<9, 10^3, Float64, Val(8)) # cycles # 0.85
49313

50314
@testset "LoopVectorization.jl" begin
51315
# Write your own tests here.

0 commit comments

Comments
 (0)