Bug fix for large unrolls.

chriselrod · chriselrod · commit 06d929454700 · 2020-01-01T01:01:48.000-05:00
diff --git a/benchmarks/driver.jl b/benchmarks/driver.jl
@@ -30,6 +30,29 @@ Base.setindex!(br::BenchmarkResult, v, i...) = br.sizedresults.results[i...] = v
 function Base.show(io::IO, br::BenchmarkResult)
     pretty_table(io, br.sizedresults, br.tests)
 end
+
+using VegaLite, IndexedTables
+function plot(br::BenchmarkResult)
+    # Line plot of GFLOPS against problem size, one coloured line per method.
+    # `vec` flattens the results in storage order; the size column is built to match it.
+    gflops = vec(br.sizedresults.results)
+    allsizes = br.sizedresults.sizes
+    labels = @view(br.tests[2:end]) # every entry of `br.tests` except the first
+    nlabels = length(labels)
+    sizecol = Vector{eltype(allsizes)}(undef, length(gflops))
+    for (k, sz) ∈ enumerate(allsizes), j ∈ 1:nlabels
+        sizecol[j + (k - 1)*nlabels] = sz # each size is repeated once per label
+    end
+    labelcol = vcat((labels for _ ∈ eachindex(allsizes))...)
+    tbl = table((GFLOPS = gflops, Size = sizecol, Method = labelcol))
+    tbl |> @vlplot(
+        :line,
+        x = :Size,
+        y = :GFLOPS,
+        color = :Method
+    )
+end
+
 function alloc_matrices(s::NTuple{3,Int})
     M, K, N = s
     C = Matrix{Float64}(undef, M, N)
@@ -38,8 +61,8 @@ function alloc_matrices(s::NTuple{3,Int})
     C, A, B
 end
 alloc_matrices(s::Int) = alloc_matrices((s,s,s))
-gflop(s::Int) = s^3 * 1e-9
-gflop(s::NTuple{3,Int}) = prod(s) * 1e-9
+gflop(s::Int) = s^3 * 2e-9 # 2 * s^3 scaled by 1e-9; presumably one multiply and one add per term (confirm)
+gflop(s::NTuple{3,Int}) = prod(s) * 2e-9 # same count when the three dimensions differ
 function benchmark_gemm(sizes)
     tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "GFort-intrinsic", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
@@ -61,27 +84,108 @@ function benchmark_gemm(sizes)
     end
     br
 end
+function benchmark_dot(sizes)
+    blasname = BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"
+    br = BenchmarkResult([blasname, "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"], sizes)
+    for (col, len) ∈ enumerate(sizes)
+        x = rand(len); y = rand(len)
+        work = len * 2e-9 # numerator for every timing below: 2 * len * 1e-9
+        br[1,col] = work / @belapsed dot($x, $y)
+        expected = dot(x, y) # the BLAS value is the reference for the checks below
+        br[2,col] = work / @belapsed jdot($x, $y)
+        @assert jdot(x,y) ≈ expected "Julia dot wrong?"
+        br[3,col] = work / @belapsed cdot($x, $y)
+        @assert cdot(x,y) ≈ expected "Polly dot wrong?"
+        br[4,col] = work / @belapsed fdot($x, $y)
+        @assert fdot(x,y) ≈ expected "Fort dot wrong?"
+        br[5,col] = work / @belapsed jdotavx($x, $y)
+        @assert jdotavx(x,y) ≈ expected "LoopVec dot wrong?"
+    end
+    br
+end
+function benchmark_selfdot(sizes)
+    blasname = BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"
+    br = BenchmarkResult([blasname, "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"], sizes)
+    for (col, len) ∈ enumerate(sizes)
+        v = rand(len)
+        work = len * 2e-9 # numerator for every timing below: 2 * len * 1e-9
+        br[1,col] = work / @belapsed dot($v, $v)
+        expected = dot(v, v) # the BLAS value is the reference for the checks below
+        br[2,col] = work / @belapsed jselfdot($v)
+        @assert jselfdot(v) ≈ expected "Julia dot wrong?"
+        br[3,col] = work / @belapsed cselfdot($v)
+        @assert cselfdot(v) ≈ expected "Polly dot wrong?"
+        br[4,col] = work / @belapsed fselfdot($v)
+        @assert fselfdot(v) ≈ expected "Fort dot wrong?"
+        br[5,col] = work / @belapsed jselfdotavx($v)
+        @assert jselfdotavx(v) ≈ expected "LoopVec dot wrong?"
+    end
+    br
+end
+totwotuple(i::Int) = (i,i) # a single size is duplicated into a pair
+totwotuple(i::Tuple{Int,Int}) = i # a pair of sizes is returned unchanged
+function sse!(Xβ, y, X, β) # returns dot(r, r) for r = X*β - y; the buffer Xβ is overwritten with r
+    mul!(copyto!(Xβ, y), X, β, 1.0, -1.0) # copy y into Xβ, then 5-arg mul!: Xβ = 1.0*X*β + (-1.0)*Xβ
+    dot(Xβ, Xβ) # squared Euclidean norm of the vector just computed
+end
+function benchmark_sse(sizes)
+    # One label per row filled below (rows 1-5), in the same order as the timings.
+    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
+    br = BenchmarkResult(tests, sizes)
+    for (i,s) ∈ enumerate(sizes)
+        N, P = totwotuple(s) # an Int size gives N == P; a tuple gives (N, P)
+        y = rand(N); β = rand(P); X = randn(N, P)
+        Xβ = similar(y) # scratch buffer that sse! overwrites
+        n_gflop = 2e-9*(P*N + 2N) # work estimate that turns seconds into the reported rate
+        br[1,i] = n_gflop / @belapsed sse!($Xβ, $y, $X, $β)
+        lpblas = sse!(Xβ, y, X, β) # BLAS-based value is the reference for the checks below
+        br[2,i] = n_gflop / @belapsed jOLSlp($y, $X, $β)
+        @assert jOLSlp(y, X, β) ≈ lpblas "Julia wrong?"
+        br[3,i] = n_gflop / @belapsed cOLSlp($y, $X, $β)
+        @assert cOLSlp(y, X, β) ≈ lpblas "Polly wrong?"
+        br[4,i] = n_gflop / @belapsed fOLSlp($y, $X, $β)
+        @assert fOLSlp(y, X, β) ≈ lpblas "Fort wrong?"
+        br[5,i] = n_gflop / @belapsed jOLSlp_avx($y, $X, $β)
+        @assert jOLSlp_avx(y, X, β) ≈ lpblas "LoopVec wrong?"
+    end
+    br
+end
 
-using VegaLite, IndexedTables
-function plot(br::BenchmarkResult)
-    res = vec(br.sizedresults.results)
-    brsizes = br.sizedresults.sizes
-    sizes = Vector{eltype(brsizes)}(undef, length(res))
-    ntests = length(br.tests) - 1
-    for i ∈ 0:length(brsizes)-1
-        si = brsizes[i+1]
-        for j ∈ 1:ntests
-            sizes[j + i*ntests] = si
-        end
+function benchmark_exp(sizes) # times elementwise exp over vectors of each length in `sizes`
+    tests = ["Julia", "GFort-loops", "LoopVectorization"]
+    br = BenchmarkResult(tests, sizes)
+    for (i,s) ∈ enumerate(sizes)
+        a = rand(s); b = similar(a)
+        n_gflop = s # not really gflops: dividing by seconds gives elements per second
+        br[1,i] = n_gflop / @belapsed @. $b = exp($a)
+        baseb = copy(b) # snapshot of the Base broadcast output, used as the reference below
+        br[2,i] = n_gflop / @belapsed fvexp!($b, $a)
+        @assert b ≈ baseb "Fort wrong?"
+        br[3,i] = n_gflop / @belapsed @avx @. $b = exp($a)
+        @assert b ≈ baseb "LoopVec wrong?"
     end
-    tests = vcat((@view(br.tests[2:end]) for _ ∈ eachindex(brsizes))...)
-    t = table((GFLOPS = res, Size = sizes, Method = tests))
-    t |> @vlplot(
-        :line,
-        x = :Size,
-        y = :GFLOPS,
-        color = :Method
-    )
+    br
+end
+
+function benchmark_aplusBc(sizes)
+    labels = ["Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
+    br = BenchmarkResult(labels, sizes)
+    for (col, sz) ∈ enumerate(sizes)
+        nrow, ncol = totwotuple(sz)
+        a = rand(nrow); B = rand(nrow, ncol); c = rand(ncol)
+        ct = c'; out = similar(B) # the adjoint of c is what the two broadcast variants use
+        work = 2e-9 * nrow*ncol
+        br[1,col] = work / @belapsed @. $out = $a + $B * $ct
+        expected = copy(out) # Base broadcast output is the reference for the checks below
+        br[2,col] = work / @belapsed caplusBc!($out, $a, $B, $c)
+        @assert out ≈ expected "Polly wrong?"
+        br[3,col] = work / @belapsed faplusBc!($out, $a, $B, $c)
+        @assert out ≈ expected "Fort wrong?"
+        br[4,col] = work / @belapsed @avx @. $out = $a + $B * $ct
+        @assert out ≈ expected "LoopVec wrong?"
+    end
+    br
 end
 
 
+
diff --git a/benchmarks/loadsharedlibs.jl b/benchmarks/loadsharedlibs.jl
@@ -1,6 +1,5 @@
 
-using VectorizationBase: REGISTER_SIZE
-# run(`gfortran `)
+using LoopVectorization.VectorizationBase: REGISTER_SIZE
 
 pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
 const LOOPVECBENCHDIR = joinpath(pkgdir("LoopVectorization"), "benchmarks")
@@ -144,5 +143,22 @@ function fOLSlp(y, X, β)
     )
     lp[]
 end
-
+function fvexp!(b, a)
+    # Calls `vexp` from LIBFTEST with (b, a, length(b)); ccall converts the length to Ref{Clong}.
+    ccall(
+        (:vexp, LIBFTEST), Cvoid,
+        (Ptr{Float64}, Ptr{Float64}, Ref{Clong}),
+        b, a, length(b)
+    )
+end
+function fvexpsum(a)
+    # Calls `svexp` from LIBFTEST and returns what it stores in `s`; length goes in as Ref{Clong}.
+    s = Ref{Float64}()
+    ccall(
+        (:svexp, LIBFTEST), Cvoid,
+        (Ref{Float64}, Ptr{Float64}, Ref{Clong}),
+        s, a, length(a)
+    )
+    s[]
+end
 
diff --git a/benchmarks/looptests.f90 b/benchmarks/looptests.f90
@@ -123,6 +123,7 @@ subroutine selfdot(s, a, N) BIND(C, name="selfdot")
          s = s + a(i) * a(i)
       end do
     end subroutine selfdot
+    !GCC$ builtin (exp) attributes simd (notinbranch) if('x86_64')
     subroutine vexp(b, a, N) BIND(C, name="vexp")
       integer(C_long), intent(in) :: N
       real(C_double), dimension(N), intent(in) :: a
diff --git a/src/lowering.jl b/src/lowering.jl
@@ -629,7 +629,7 @@ function lower_unrolled_dynamic!(
     manageouterreductions = T == -1 && length(ls.outer_reductions) > 0
     if manageouterreductions
         # Umax = (!static_unroll && U > 2) ? U >> 1 : U
-        Ureduct = min(U, 4)
+        Ureduct = U > 6 ? 4 : U
         initialize_outer_reductions!(q, ls, 0, Ureduct, W, last(names(ls)))
     else
         Ureduct = -1
@@ -653,9 +653,14 @@ function lower_unrolled_dynamic!(
             push!(remblock.args, remblocknew)
             remblock = remblocknew
         end
-        if Ut == U
+        if Ut == U || Ut == Ureduct
             firstiter || break
             firstiter = false
+            if manageouterreductions && Ureduct < U
+                Udiff = U - Ureduct
+                loopq = lower_set(ls, Udiff, T, Wt, nothing, :if)
+                push!(q.args, loopq)
+            end
             Ut = 1
             # setup for branchy remainder calculation
             comparison = Expr(:call, :(!=), unrolled_numitersym, unrolled)