Skip to content

Commit c1c8236

Browse files
committed
Continued progress.
1 parent f7afbe4 commit c1c8236

18 files changed

+288
-179
lines changed

benchmark/benchmarkflops.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -258,9 +258,9 @@ function dot3_bench!(br, s, i)
258258
x = rand(M); A = rand(M, N); y = rand(N);
259259
dotblas = dot(x, A, y)
260260
n_gflop = M*N * 3e-9
261-
br[1,i] = n_gflop / @belapsed jdot3avx($x, $A, $y)
261+
br[1,i] = n_gflop / @belapsed jdot3v2avx($x, $A, $y)
262262
@assert jdot3avx(x, A, y) ≈ dotblas "LoopVec dot wrong?"
263-
br[2,i] = n_gflop / @belapsed jdot3($x, $A, $y)
263+
br[2,i] = n_gflop / @belapsed jdot3v2($x, $A, $y)
264264
@assert jdot3(x, A, y) ≈ dotblas "Julia dot wrong?"
265265
br[3,i] = n_gflop / @belapsed cdot3($x, $A, $y)
266266
@assert cdot3(x, A, y) ≈ dotblas "Clang dot wrong?"

benchmark/driver.jl

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -22,33 +22,34 @@ end
2222
# sizes = 23:23
2323
sizes = 256:-1:2
2424

25-
@show AmulB_bench = benchmark_AmulB(sizes);
26-
@show AmulBt_bench = benchmark_AmulBt(sizes);
27-
@show AtmulBt_bench = benchmark_AtmulBt(sizes);
28-
@show AtmulB_bench = benchmark_AtmulB(sizes);
25+
AmulB_bench = benchmark_AmulB(sizes); println("A * B benchmark results:"); println(AmulB_bench)
26+
AmulBt_bench = benchmark_AmulBt(sizes); println("A * B' benchmark results:"); println(AmulBt_bench)
27+
AtmulBt_bench = benchmark_AtmulBt(sizes); println("A' * B' benchmark results:"); println(AtmulBt_bench)
28+
AtmulB_bench = benchmark_AtmulB(sizes); println("A' * B benchmark results:"); println(AtmulB_bench)
2929

30-
@show Amulvb_bench = benchmark_Amulvb(sizes);
31-
@show Atmulvb_bench = benchmark_Atmulvb(sizes);
30+
Amulvb_bench = benchmark_Amulvb(sizes); println("A * b benchmark results:"); println(Amulvb_bench)
31+
Atmulvb_bench = benchmark_Atmulvb(sizes); println("A' * b benchmark results:"); println(Atmulvb_bench)
3232

33-
@show filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes);
34-
@show filter2d_3x3_bench = benchmark_filter2d3x3(sizes);
35-
@show filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes);
33+
filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes); println("Benchmark results for dynamically sized 3x3 convolution:"); println(filter2d_dynamic_bench)
34+
filter2d_3x3_bench = benchmark_filter2d3x3(sizes); println("Benchmark results for statically sized 3x3 convolution:"); println(filter2d_3x3_bench)
35+
filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes); println("Benchmark results for unrolled 3x3 convolution:"); println(filter2d_unrolled_bench)
3636

37-
@show dot3_bench = benchmark_dot3(sizes);
38-
@show dot_bench = benchmark_dot(sizes);
39-
@show selfdot_bench = benchmark_selfdot(sizes);
40-
@show sse_bench = benchmark_sse(sizes);
41-
@show aplusBc_bench = benchmark_aplusBc(sizes);
42-
@show AplusAt_bench = benchmark_AplusAt(sizes);
43-
@show vexp_bench = benchmark_exp(sizes);
44-
@show randomaccess_bench = benchmark_random_access(sizes);
45-
@show logdettriangle_bench = benchmark_logdettriangle(sizes);
37+
dot3_bench = benchmark_dot3(sizes); println("x' * A * y benchmark results:"); println(dot3_bench)
38+
dot_bench = benchmark_dot(sizes); println("a' * b benchmark results:"); println(dot_bench)
39+
selfdot_bench = benchmark_selfdot(sizes); println("a' * a benchmark results:"); println(selfdot_bench)
40+
sse_bench = benchmark_sse(sizes); println("Benchmark resutls of summing squared error:"); println(sse_bench)
41+
aplusBc_bench = benchmark_aplusBc(sizes); println("Benchmark results of a .+ B .* c':"); println(aplusBc_bench)
42+
AplusAt_bench = benchmark_AplusAt(sizes); println("Benchmark results of A * A':"); println(AplusAt_bench)
43+
vexp_bench = benchmark_exp(sizes); println("Benchmark results of exponentiating a vector:"); println(vexp_bench)
44+
randomaccess_bench = benchmark_random_access(sizes); println("Benchmark results from using a vector of indices:"); println(randomaccess_bench)
45+
logdettriangle_bench = benchmark_logdettriangle(sizes); println("logdet(LowerTriangular(A)) benchmark results:"); println(logdettriangle_bench)
4646

4747
const v = 1
4848
using Cairo, Fontconfig
4949
const PICTURES = joinpath(pkgdir(LoopVectorization), "docs", "src", "assets")
5050
saveplot(f, br) = draw(PNG(joinpath(PICTURES, f * "$v.png"), 12inch, 8inch), plot(br))
5151

52+
saveplot("bench_logdettriangle_v", logdettriangle_bench);
5253
saveplot("bench_filter2d_dynamic_v", filter2d_dynamic_bench);
5354
saveplot("bench_filter2d_3x3_v", filter2d_3x3_bench);
5455
saveplot("bench_filter2d_unrolled_v", filter2d_unrolled_bench);
@@ -60,7 +61,6 @@ saveplot("bench_aplusBc_v", aplusBc_bench);
6061
saveplot("bench_AplusAt_v", AplusAt_bench);
6162
saveplot("bench_exp_v", vexp_bench);
6263
saveplot("bench_random_access_v", randomaccess_bench);
63-
saveplot("bench_logdettriangle_v", logdettriangle_bench);
6464
saveplot("bench_AmulB_v", AmulB_bench);
6565
saveplot("bench_AmulBt_v", AmulBt_bench);
6666
saveplot("bench_AtmulB_v", AtmulB_bench);

benchmark/loadsharedlibs.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ for (p,s) ∈ [(:c,Cshared) (:e,Eshared)]
238238
@eval function $(Symbol(prefix,p,:dot3))(x, A, y)
239239
M, N = size(A)
240240
ccall(
241-
(:dot3, $s), Float64,
241+
(:dot3v2, $s), Float64,
242242
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong),
243243
x, A, y, M, N
244244
)
@@ -247,7 +247,7 @@ end
247247
@eval function $(Symbol(prefix,:fdot3))(x, A, y)
248248
M, N = size(A)
249249
ccall(
250-
(:dot3, $Fshared), Float64,
250+
(:dot3v2, $Fshared), Float64,
251251
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}),
252252
x, A, y, Ref(M), Ref(N)
253253
)

benchmark/looptests.jl

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -116,23 +116,21 @@ function jdot3avx(x, A, y)
116116
s
117117
end
118118
function jdot3v2(x, A, y)
119-
M, N = size(A)
120119
s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
121-
@inbounds @fastmath for n ∈ 1:N
120+
@inbounds @fastmath for n ∈ axes(A,2)
122121
t = zero(s)
123-
@simd ivdep for m ∈ 1:M
122+
@simd ivdep for m ∈ axes(A,1)
124123
t += x[m] * A[m,n]
125124
end
126125
s += t * y[n]
127126
end
128127
s
129128
end
130129
function jdot3v2avx(x, A, y)
131-
M, N = size(A)
132130
s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
133-
@avx for n ∈ 1:N
131+
@avx for n ∈ axes(A,2)
134132
t = zero(s)
135-
for m ∈ 1:M
133+
for m ∈ axes(A,1)
136134
t += x[m] * A[m,n]
137135
end
138136
s += t * y[n]

src/determinestrategy.jl

Lines changed: 95 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -255,9 +255,29 @@ end
255255
# end
256256
# u₁b, u₂b, cb
257257
# end
258+
259+
function solve_unroll_iter(X, R, u₁L, u₂L, u₁range, u₂range)
260+
R₁, R₂, R₃, R₄, R₅ = R[1], R[2], R[3], R[4], R[5]
261+
RR = REGISTER_COUNT - R₃ - R₄
262+
u₁best, u₂best = 0, 0
263+
bestcost = Inf
264+
for u₁temp ∈ u₁range
265+
for u₂temp ∈ u₂range
266+
RR ≥ u₁temp*u₂temp*R₁ + u₁temp*R₂ + u₂temp*R₅ || continue
267+
tempcost = unroll_cost(X, u₁temp, u₂temp, u₁L, u₂L)
268+
if tempcost < bestcost
269+
bestcost = tempcost
270+
u₁best, u₂best = u₁temp, u₂temp
271+
end
272+
end
273+
end
274+
u₁best, u₂best, bestcost
275+
end
276+
258277
function solve_unroll(X, R, u₁L, u₂L)
259278
X₁, X₂, X₃, X₄ = X[1], X[2], X[3], X[4]
260-
R₁, R₂, R₃, R₄ = R[1], R[2], R[3], R[4]
279+
R₁, R₂, R₃, R₄, R₅ = R[1], R[2], R[3], R[4], R[5]
280+
iszero(R₅) || return solve_unroll_iter(X, R, u₁L, u₂L, 1:10, 1:10)
261281
RR = REGISTER_COUNT - R₃ - R₄
262282
a = R₂^2*X₃ -R₁*X₄ * R₂ - R₁*X₂*RR
263283
b = R₁ * X₄ * RR - R₁ * X₄ * RR - 2X₃*RR*R₂
@@ -272,50 +292,24 @@ function solve_unroll(X, R, u₁L, u₂L)
272292
end
273293
u₁low = floor(Int, u₁float)
274294
u₂low = max(1, floor(Int, u₂float)) # must be at least 1
275-
u₁high = u₁low + 1 #ceil(Int, u₁float)
276-
u₂high = u₂low + 1 #ceil(Int, u₂float)
277-
278-
# RR = REGISTER_COUNT - R[3] - R[4]
279-
u₁, u₂ = u₁low, u₂low
280-
ucost = unroll_cost(X, u₁low, u₂low, u₁L, u₂L)
281-
# @show u₁low*u₂high*R[1] + u₁low*R[2]
282-
if RR ≥ u₁low*u₂high*R[1] + u₁low*R[2]
283-
ucost_temp = unroll_cost(X, u₁low, u₂high, u₁L, u₂L)
284-
# @show ucost_temp, ucost
285-
if ucost_temp < ucost
286-
ucost = ucost_temp
287-
u₁, u₂ = u₁low, u₂high
288-
end
289-
end
290-
# The RR + 1 is a hack to get it to favor u₁high in more scenarios
291-
u₂l = u₂low
292-
while RR < u₁high*u₂l*R[1] + u₁high*R[2] && u₂l > 1
293-
u₂l -= 1
294-
end
295-
ucost_temp = unroll_cost(X, u₁high, u₂l, u₁L, u₂L)
296-
if ucost_temp < ucost
297-
ucost = ucost_temp
298-
u₁, u₂ = u₁high, u₂l
299-
end
300-
if RR > u₁high*u₂high*R[1] + u₁high*R[2]
301-
throw("Something went wrong when solving for u₂float and u₁float.")
302-
end
303-
u₁, u₂, ucost
295+
u₁high = solve_unroll_constT(R, u₂low) + 1
296+
u₂high = solve_unroll_constU(R, u₁low) + 1
297+
solve_unroll_iter(X, R, u₁L, u₂L, u₁low:u₁high, u₂low:u₂high)
304298
end
305299

306-
function solve_unroll_constU(X, R, U)
307-
floor(Int, (REGISTER_COUNT - R[3] - R[4] - U*R[2]) / (U * R[1]))
300+
function solve_unroll_constU(R::AbstractVector, u₁::Int)
301+
floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₁*R[2]) / (u₁ * R[1] + R[5]))
308302
end
309-
function solve_unroll_constT(X, R, u₂)
310-
floor(Int, (REGISTER_COUNT - R[3] - R[4]) / (u₂ * R[1] + R[2]))
303+
function solve_unroll_constT(R::AbstractVector, u₂::Int)
304+
floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
311305
end
312-
function solve_unroll_constT(ls, u₂)
306+
function solve_unroll_constT(ls::LoopSet, u₂::Int)
313307
R = @view ls.reg_pres[:,1]
314-
floor(Int, (REGISTER_COUNT - R[3] - R[4]) / (u₂ * R[1] + R[2]))
308+
floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
315309
end
316310
# Tiling here is about alleviating register pressure for the UxT
317311
function solve_unroll(X, R, u₁max, u₂max, u₁L, u₂L)
318-
iszero(first(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, u₁max, u₂max)
312+
# iszero(first(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, u₁max, u₂max)
319313
u₁, u₂, cost = solve_unroll(X, R, u₁L, u₂L)
320314
# u₂ -= u₂ & 1
321315
# u₁ = min(u₁, u₂)
@@ -327,12 +321,12 @@ function solve_unroll(X, R, u₁max, u₂max, u₁L, u₂L)
327321
u₂ = u₂max
328322
else # u₁ too large, resolve u₂
329323
u₁ = u₁max
330-
u₂ = min(u₂max, max(1,solve_unroll_constU(X, R, u₁)))
324+
u₂ = min(u₂max, max(1,solve_unroll_constU(R, u₁)))
331325
end
332326
cost = unroll_cost(X, u₁, u₂, u₁L, u₂L)
333327
elseif u₂_too_large
334328
u₂ = u₂max
335-
u₁ = min(u₁max, max(1,solve_unroll_constT(X, R, u₂)))
329+
u₁ = min(u₁max, max(1,solve_unroll_constT(R, u₂)))
336330
cost = unroll_cost(X, u₁, u₂, u₁L, u₂L)
337331
end
338332
u₁, u₂, cost
@@ -376,14 +370,14 @@ function solve_unroll(
376370
W::Int, vectorized::Symbol,
377371
u₁loop::Loop, u₂loop::Loop
378372
)
379-
maxu₂base = maxu₁base = VectorizationBase.REGISTER_COUNT == 32 ? 6 : 4#8
373+
maxu₂base = maxu₁base = VectorizationBase.REGISTER_COUNT == 32 ? 10 : 6#8
380374
maxu₂ = maxu₂base#8
381375
maxu₁ = maxu₁base#8
382376
u₁L = length(u₁loop)
383377
u₂L = length(u₂loop)
384378
if isstaticloop(u₂loop)
385379
if u₂loopsym !== vectorized && u₂L ≤ 4
386-
u₁ = max(1, solve_unroll_constT(cost_vec, reg_pressure, u₂L))
380+
u₁ = max(1, solve_unroll_constT(reg_pressure, u₂L))
387381
u₁ = isstaticloop(u₁loop) ? min(u₁, u₁L) : u₁
388382
return u₁, u₂L, unroll_cost(cost_vec, u₁, u₂L, u₁L, u₂L)
389383
end
@@ -392,7 +386,7 @@ function solve_unroll(
392386
end
393387
if isstaticloop(u₁loop)
394388
if u₁loopsym !== vectorized && u₁L ≤ 4
395-
u₂ = max(1, solve_unroll_constU(cost_vec, reg_pressure, u₁L))
389+
u₂ = max(1, solve_unroll_constU(reg_pressure, u₁L))
396390
u₂ = isstaticloop(u₂loop) ? min(u₂, u₂L) : u₂
397391
return u₁L, u₂, unroll_cost(cost_vec, u₁L, u₂, u₁L, u₂L)
398392
end
@@ -525,23 +519,41 @@ function maxnegativeoffset(ls::LoopSet, op::Operation, unrollsyms::UnrollSymbols
525519
end
526520
mno, i
527521
end
528-
function load_elimination_cost_factor(ls::LoopSet, op::Operation, unrollsyms::UnrollSymbols)
529-
@unpack u₁loopsym, u₂loopsym = unrollsyms
522+
function load_elimination_cost_factor!(
523+
cost_vec, reg_pressure, choose_to_inline, ls::LoopSet, op::Operation, iters, unrollsyms::UnrollSymbols, Wshift, size_T
524+
)
525+
@unpack u₁loopsym, u₂loopsym, vectorized = unrollsyms
530526
if !iszero(first(isoptranslation(ls, op, unrollsyms)))
531-
for loop ∈ ls.loops
532-
# If another loop is short, assume that LLVM will unroll it, in which case
533-
# we want to be a little more conservative in terms of register pressure.
534-
#FIXME: heuristic hack to get some desired behavior.
535-
if isstaticloop(loop) && length(loop) ≤ 4
536-
itersym = loop.itersymbol
537-
if itersym !== u₁loopsym && itersym !== u₂loopsym
538-
return (0.25, VectorizationBase.REGISTER_COUNT == 32 ? 2.0 : 1.0)
539-
end
540-
end
541-
end
542-
(0.25, VectorizationBase.REGISTER_COUNT == 32 ? 1.2 : 1.0)
527+
rt, lat, rp = cost(ls, op, vectorized, Wshift, size_T)
528+
rt *= iters
529+
# rt *= factor1; rp *= factor2;
530+
choose_to_inline[] = true
531+
# for loop ∈ ls.loops
532+
# # If another loop is short, assume that LLVM will unroll it, in which case
533+
# # we want to be a little more conservative in terms of register pressure.
534+
# #FIXME: heuristic hack to get some desired behavior.
535+
# if isstaticloop(loop) && length(loop) ≤ 4
536+
# itersym = loop.itersymbol
537+
# if itersym !== u₁loopsym && itersym !== u₂loopsym
538+
# return (0.25, VectorizationBase.REGISTER_COUNT == 32 ? 2.0 : 1.0)
539+
# # return (0.25, 1.0)
540+
# return true
541+
# end
542+
# end
543+
# end
544+
# # (0.25, VectorizationBase.REGISTER_COUNT == 32 ? 1.2 : 1.0)
545+
# (0.25, 1.0)
546+
cost_vec[1] += 0.1rt
547+
reg_pressure[1] += 0.51rp
548+
cost_vec[2] += rt
549+
reg_pressure[2] += rp
550+
cost_vec[3] += rt
551+
# reg_pressure[3] += rp
552+
reg_pressure[5] += rp
553+
true
543554
else
544555
(1.0, 1.0)
556+
false
545557
end
546558
end
547559
function add_constant_offset_load_elmination_cost!(
@@ -624,6 +636,7 @@ function evaluate_cost_tile(
624636
iter::Int = 1
625637
u₁reached = u₂reached = false
626638
choose_to_inline = Ref(false)
639+
copyto!(names(ls), order); reverse!(names(ls))
627640
for n ∈ 1:N
628641
itersym = order[n]
629642
if itersym == u₁loopsym
@@ -664,30 +677,33 @@ function evaluate_cost_tile(
664677

665678
u₁reduces, u₂reduces = reduced_by_unrolling[1,id], reduced_by_unrolling[2,id]
666679
# @show op, u₁reduces, u₂reduces
667-
if !(isload(op) && add_constant_offset_load_elmination_cost!(cost_vec, reg_pressure, choose_to_inline, ls, op, iters[id], unrollsyms, u₁reduces, u₂reduces, Wshift, size_T, opisininnerloop))
668-
rt, lat, rp = cost(ls, op, vectorized, Wshift, size_T)
680+
if isload(op)
681+
if add_constant_offset_load_elmination_cost!(cost_vec, reg_pressure, choose_to_inline, ls, op, iters[id], unrollsyms, u₁reduces, u₂reduces, Wshift, size_T, opisininnerloop)
682+
continue
683+
elseif load_elimination_cost_factor!(cost_vec, reg_pressure, choose_to_inline, ls, op, iters[id], unrollsyms, Wshift, size_T)
684+
continue
685+
end
686+
end
669687
# @show op rt, lat, rp
670-
if isload(op)
671-
factor1, factor2 = load_elimination_cost_factor(ls, op, unrollsyms)
672-
rt *= factor1; rp *= factor2;
673-
choose_to_inline[] |= factor1 < 1
674-
end
675-
# @show isunrolled₁, isunrolled₂, op rt, lat, rp
676-
rp = opisininnerloop ? rp : zero(rp) # we only care about register pressure within the inner most loop
677-
rt *= iters[id]
678-
if u₁reduces & u₂reduces
679-
cost_vec[4] += rt
680-
reg_pressure[4] += rp
681-
elseif u₂reduces # cost decreased by unrolling u₂loop
682-
cost_vec[2] += rt
683-
reg_pressure[2] += rp
684-
elseif u₁reduces # cost decreased by unrolling u₁loop
685-
cost_vec[3] += rt
686-
reg_pressure[3] += rp
687-
else # no cost decrease; cost must be repeated
688-
cost_vec[1] += rt
689-
reg_pressure[1] += rp
690-
end
688+
rt, lat, rp = cost(ls, op, vectorized, Wshift, size_T)
689+
if isload(op) && !iszero(prefetchisagoodidea(ls, op, UnrollArgs(4, unrollsyms, 4, 0)))
690+
rt += 0.5VectorizationBase.REGISTER_SIZE / VectorizationBase.CACHELINE_SIZE
691+
end
692+
# @show isunrolled₁, isunrolled₂, op rt, lat, rp
693+
rp = opisininnerloop ? rp : zero(rp) # we only care about register pressure within the inner most loop
694+
rt *= iters[id]
695+
if u₁reduces & u₂reduces
696+
cost_vec[4] += rt
697+
reg_pressure[4] += rp
698+
elseif u₂reduces # cost decreased by unrolling u₂loop
699+
cost_vec[2] += rt
700+
reg_pressure[2] += rp
701+
elseif u₁reduces # cost decreased by unrolling u₁loop
702+
cost_vec[3] += rt
703+
reg_pressure[3] += rp
704+
else # no cost decrease; cost must be repeated
705+
cost_vec[1] += rt
706+
reg_pressure[1] += rp
691707
end
692708
end
693709
# @show cost_vec reg_pressure

0 commit comments

Comments
 (0)