Skip to content

Commit 06d9294

Browse files
committed
Bug fix for large unrolls.
1 parent 090c173 commit 06d9294

File tree

4 files changed

+152
-26
lines changed

4 files changed

+152
-26
lines changed

benchmarks/driver.jl

Lines changed: 125 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,29 @@ Base.setindex!(br::BenchmarkResult, v, i...) = br.sizedresults.results[i...] = v
3030
function Base.show(io::IO, br::BenchmarkResult)
3131
pretty_table(io, br.sizedresults, br.tests)
3232
end
33+
34+
using VegaLite, IndexedTables
35+
function plot(br::BenchmarkResult)
36+
res = vec(br.sizedresults.results)
37+
brsizes = br.sizedresults.sizes
38+
sizes = Vector{eltype(brsizes)}(undef, length(res))
39+
ntests = length(br.tests) - 1
40+
for i ∈ 0:length(brsizes)-1
41+
si = brsizes[i+1]
42+
for j ∈ 1:ntests
43+
sizes[j + i*ntests] = si
44+
end
45+
end
46+
tests = vcat((@view(br.tests[2:end]) for _ ∈ eachindex(brsizes))...)
47+
t = table((GFLOPS = res, Size = sizes, Method = tests))
48+
t |> @vlplot(
49+
:line,
50+
x = :Size,
51+
y = :GFLOPS,
52+
color = :Method
53+
)
54+
end
55+
3356
function alloc_matrices(s::NTuple{3,Int})
3457
M, K, N = s
3558
C = Matrix{Float64}(undef, M, N)
@@ -38,8 +61,8 @@ function alloc_matrices(s::NTuple{3,Int})
3861
C, A, B
3962
end
4063
alloc_matrices(s::Int) = alloc_matrices((s,s,s))
41-
gflop(s::Int) = s^3 * 1e-9
42-
gflop(s::NTuple{3,Int}) = prod(s) * 1e-9
64+
gflop(s::Int) = s^3 * 2e-9
65+
gflop(s::NTuple{3,Int}) = prod(s) * 2e-9
4366
function benchmark_gemm(sizes)
4467
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "GFort-intrinsic", "LoopVectorization"]
4568
br = BenchmarkResult(tests, sizes)
@@ -61,27 +84,108 @@ function benchmark_gemm(sizes)
6184
end
6285
br
6386
end
87+
function benchmark_dot(sizes)
88+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
89+
br = BenchmarkResult(tests, sizes)
90+
for (i,s) ∈ enumerate(sizes)
91+
a = rand(s); b = rand(s);
92+
n_gflop = s * 2e-9
93+
br[1,i] = n_gflop / @belapsed dot($a, $b)
94+
dotblas = dot(a, b)
95+
br[2,i] = n_gflop / @belapsed jdot($a, $b)
96+
@assert jdot(a,b) ≈ dotblas "Julia dot wrong?"
97+
br[3,i] = n_gflop / @belapsed cdot($a, $b)
98+
@assert cdot(a,b) ≈ dotblas "Polly dot wrong?"
99+
br[4,i] = n_gflop / @belapsed fdot($a, $b)
100+
@assert fdot(a,b) ≈ dotblas "Fort dot wrong?"
101+
br[5,i] = n_gflop / @belapsed jdotavx($a, $b)
102+
@assert jdotavx(a,b) ≈ dotblas "LoopVec dot wrong?"
103+
end
104+
br
105+
end
106+
function benchmark_selfdot(sizes)
107+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
108+
br = BenchmarkResult(tests, sizes)
109+
for (i,s) ∈ enumerate(sizes)
110+
a = rand(s);
111+
n_gflop = s * 2e-9
112+
br[1,i] = n_gflop / @belapsed dot($a, $a)
113+
dotblas = dot(a, a)
114+
br[2,i] = n_gflop / @belapsed jselfdot($a)
115+
@assert jselfdot(a) ≈ dotblas "Julia dot wrong?"
116+
br[3,i] = n_gflop / @belapsed cselfdot($a)
117+
@assert cselfdot(a) ≈ dotblas "Polly dot wrong?"
118+
br[4,i] = n_gflop / @belapsed fselfdot($a)
119+
@assert fselfdot(a) ≈ dotblas "Fort dot wrong?"
120+
br[5,i] = n_gflop / @belapsed jselfdotavx($a)
121+
@assert jselfdotavx(a) ≈ dotblas "LoopVec dot wrong?"
122+
end
123+
br
124+
end
125+
totwotuple(i::Int) = (i,i)
126+
totwotuple(i::Tuple{Int,Int}) = i
127+
function sse!(Xβ, y, X, β)
128+
mul!(copyto!(Xβ, y), X, β, 1.0, -1.0)
129+
dot(Xβ, Xβ)
130+
end
131+
function benchmark_sse(sizes)
132+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "GFort-intrinsic", "LoopVectorization"]
133+
br = BenchmarkResult(tests, sizes)
134+
for (i,s) ∈ enumerate(sizes)
135+
N, P = totwotuple(s)
136+
y = rand(N); β = rand(P)
137+
X = randn(N, P)
138+
Xβ = similar(y)
139+
n_gflop = 2e-9*(P*N + 2N)
140+
br[1,i] = n_gflop / @belapsed sse!($Xβ, $y, $X, $β)
141+
lpblas = sse!(Xβ, y, X, β)
142+
br[2,i] = n_gflop / @belapsed jOLSlp($y, $X, $β)
143+
@assert jOLSlp(y, X, β) ≈ lpblas "Julia wrong?"
144+
br[3,i] = n_gflop / @belapsed cOLSlp($y, $X, $β)
145+
@assert cOLSlp(y, X, β) ≈ lpblas "Polly wrong?"
146+
br[4,i] = n_gflop / @belapsed fOLSlp($y, $X, $β)
147+
@assert fOLSlp(y, X, β) ≈ lpblas "Fort wrong?"
148+
br[5,i] = n_gflop / @belapsed jOLSlp_avx($y, $X, $β)
149+
@assert jOLSlp_avx(y, X, β) ≈ lpblas "LoopVec wrong?"
150+
end
151+
br
152+
end
64153

65-
using VegaLite, IndexedTables
66-
function plot(br::BenchmarkResult)
67-
res = vec(br.sizedresults.results)
68-
brsizes = br.sizedresults.sizes
69-
sizes = Vector{eltype(brsizes)}(undef, length(res))
70-
ntests = length(br.tests) - 1
71-
for i ∈ 0:length(brsizes)-1
72-
si = brsizes[i+1]
73-
for j ∈ 1:ntests
74-
sizes[j + i*ntests] = si
75-
end
154+
function benchmark_exp(sizes)
155+
tests = ["Julia", "GFort-loops", "LoopVectorization"]
156+
br = BenchmarkResult(tests, sizes)
157+
for (i,s) ∈ enumerate(sizes)
158+
a = rand(s); b = similar(a)
159+
n_gflop = s # not really gflops
160+
br[1,i] = n_gflop / @belapsed @. $b = exp($a)
161+
baseb = copy(b)
162+
br[2,i] = n_gflop / @belapsed fvexp!($b, $a)
163+
@assert b ≈ baseb "Fort wrong?"
164+
br[3,i] = n_gflop / @belapsed @avx @. $b = exp($a)
165+
@assert b ≈ baseb "LoopVec wrong?"
76166
end
77-
tests = vcat((@view(br.tests[2:end]) for _ ∈ eachindex(brsizes))...)
78-
t = table((GFLOPS = res, Size = sizes, Method = tests))
79-
t |> @vlplot(
80-
:line,
81-
x = :Size,
82-
y = :GFLOPS,
83-
color = :Method
84-
)
167+
br
168+
end
169+
170+
function benchmark_aplusBc(sizes)
171+
tests = ["Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
172+
br = BenchmarkResult(tests, sizes)
173+
for (i,s) ∈ enumerate(sizes)
174+
M, N = totwotuple(s)
175+
a = rand(M); B = rand(M,N); c = rand(N);
176+
c′ = c'; D = similar(B)
177+
n_gflop = 2e-9 * M*N
178+
br[1,i] = n_gflop / @belapsed @. $D = $a + $B * $c′
179+
Dcopy = copy(D)
180+
br[2,i] = n_gflop / @belapsed caplusBc!($D, $a, $B, $c)
181+
@assert D ≈ Dcopy "Polly wrong?"
182+
br[3,i] = n_gflop / @belapsed faplusBc!($D, $a, $B, $c)
183+
@assert D ≈ Dcopy "Fort wrong?"
184+
br[4,i] = n_gflop / @belapsed @avx @. $D = $a + $B * $c′
185+
@assert D ≈ Dcopy "LoopVec wrong?"
186+
end
187+
br
85188
end
86189

87190

191+

benchmarks/loadsharedlibs.jl

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11

2-
using VectorizationBase: REGISTER_SIZE
3-
# run(`gfortran `)
2+
using LoopVectorization.VectorizationBase: REGISTER_SIZE
43

54
pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
65
const LOOPVECBENCHDIR = joinpath(pkgdir("LoopVectorization"), "benchmarks")
@@ -144,5 +143,22 @@ function fOLSlp(y, X, β)
144143
)
145144
lp[]
146145
end
147-
146+
function fvexp!(b, a)
147+
N = length(b)
148+
ccall(
149+
(:vexp, LIBFTEST), Cvoid,
150+
(Ptr{Float64}, Ptr{Float64}, Ref{Clong}),
151+
b, a, Ref(N)
152+
)
153+
end
154+
function fvexpsum(a)
155+
N = length(a)
156+
s = Ref{Float64}()
157+
ccall(
158+
(:svexp, LIBFTEST), Cvoid,
159+
(Ref{Float64}, Ptr{Float64}, Ref{Clong}),
160+
s, a, Ref(N)
161+
)
162+
s[]
163+
end
148164

benchmarks/looptests.f90

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ subroutine selfdot(s, a, N) BIND(C, name="selfdot")
123123
s = s + a(i) * a(i)
124124
end do
125125
end subroutine selfdot
126+
!GCC$ builtin (exp) attributes simd (notinbranch) if('x86_64')
126127
subroutine vexp(b, a, N) BIND(C, name="vexp")
127128
integer(C_long), intent(in) :: N
128129
real(C_double), dimension(N), intent(in) :: a

src/lowering.jl

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -629,7 +629,7 @@ function lower_unrolled_dynamic!(
629629
manageouterreductions = T == -1 && length(ls.outer_reductions) > 0
630630
if manageouterreductions
631631
# Umax = (!static_unroll && U > 2) ? U >> 1 : U
632-
Ureduct = min(U, 4)
632+
Ureduct = U > 6 ? 4 : U
633633
initialize_outer_reductions!(q, ls, 0, Ureduct, W, last(names(ls)))
634634
else
635635
Ureduct = -1
@@ -653,9 +653,14 @@ function lower_unrolled_dynamic!(
653653
push!(remblock.args, remblocknew)
654654
remblock = remblocknew
655655
end
656-
if Ut == U
656+
if Ut == U || Ut == Ureduct
657657
firstiter || break
658658
firstiter = false
659+
if manageouterreductions && Ureduct < U
660+
Udiff = U - Ureduct
661+
loopq = lower_set(ls, Udiff, T, Wt, nothing, :if)
662+
push!(q.args, loopq)
663+
end
659664
Ut = 1
660665
# setup for branchy remainder calculation
661666
comparison = Expr(:call, :(!=), unrolled_numitersym, unrolled)

0 commit comments

Comments
 (0)