Commit 053c8d2

Minor doc updates and update non-AVX512 tests.
1 parent 6251cb7 commit 053c8d2

10 files changed: +71 −68 lines

benchmark/looptests.jl

Lines changed: 6 additions & 6 deletions
```diff
@@ -259,17 +259,17 @@ function randomaccessavx(P, basis, coeffs::Vector{T}) where {T}
     end
     return p
 end
-function jlogdettriangle(T::Union{LowerTriangular,UpperTriangular})
+function jlogdettriangle(B::Union{LowerTriangular,UpperTriangular})
     ld = 0.0
-    @inbounds for n ∈ 1:size(T,1)
-        ld += log(T[n,n])
+    @inbounds @fastmath for n ∈ 1:size(B,1)
+        ld += log(B[n,n])
     end
     ld
 end
-function jlogdettriangleavx(T::Union{LowerTriangular,UpperTriangular})
-    A = parent(T) # No longer supported
+function jlogdettriangleavx(B::Union{LowerTriangular,UpperTriangular})
+    A = parent(B) # No longer supported
     ld = zero(eltype(A))
-    @avx for n ∈ 1:size(T,1)
+    @avx for n ∈ axes(A,1)
         ld += log(A[n,n])
     end
     ld
```

docs/make.jl

Lines changed: 1 addition & 0 deletions
```diff
@@ -10,6 +10,7 @@ makedocs(;
         "examples/matrix_multiplication.md",
         "examples/matrix_vector_ops.md",
         "examples/dot_product.md",
+        "examples/special_functions.md",
         "examples/sum_of_squared_error.md",
         "examples/filtering.md"
     ],
```

docs/src/assets/bench_logdettriangle_v1.svg

Lines changed: 1 addition & 1 deletion

docs/src/examples/matrix_vector_ops.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -3,7 +3,7 @@
 Here I'll discuss a variety of Matrix-vector operations, naturally starting with matrix-vector multiplication.
 
 ```julia
-@inline function jgemvavx!(𝐲, 𝐀, 𝐱)
+function jgemvavx!(𝐲, 𝐀, 𝐱)
     @avx for i ∈ eachindex(𝐲)
         𝐲ᵢ = zero(eltype(𝐲))
         for j ∈ eachindex(𝐱)
````

docs/src/examples/special_functions.md (new file)

Lines changed: 27 additions & 0 deletions

# Special Functions

`LoopVectorization` supports vectorizing many special functions; for example, to calculate the log determinant of a triangular matrix:
```julia
function logdettriangle(B::Union{LowerTriangular,UpperTriangular})
    A = parent(B) # using a triangular matrix would fall back to the default loop.
    ld = zero(eltype(A))
    @avx for n ∈ axes(A,1)
        ld += log(A[n,n])
    end
    ld
end
```
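
As a quick sanity check (our own usage example, not part of the committed page), the result should agree with `LinearAlgebra.logdet` applied to the triangular wrapper:

```julia
using LinearAlgebra, LoopVectorization

U = UpperTriangular(rand(100, 100) .+ 1.0) # the shift keeps the diagonal inside log's domain
logdettriangle(U) ≈ logdet(U)              # true, up to floating-point roundoff
```
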
![logdettriangle](../assets/bench_logdettriangle_v1.svg)

While Intel's proprietary compilers do best, LoopVectorization performs very well among the open-source alternatives. A complicating factor in the benchmark above is that accessing the diagonal does not touch contiguous elements. A benchmark that simply exponentiates a vector shows that `gcc` also vectorizes special functions efficiently, but that its autovectorizer balks at the discontiguous memory accesses:

![exp](../assets/bench_exp_v1.svg)
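
The exponentiation benchmark presumably times a loop like the following sketch (our reconstruction; the name `vexp!` is ours):

```julia
function vexp!(y, x)
    @avx for i ∈ eachindex(x)
        y[i] = exp(x[i]) # the vectorized exp comes from SLEEFPirates
    end
    y
end
```
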
The similar performance between `gfortran` and `LoopVectorization` at multiples of 8 is no fluke: on Linux systems with a recent GLIBC, SLEEFPirates.jl -- which LoopVectorization depends on to vectorize these special functions -- looks for the GNU vector library and uses its functions when available. Otherwise, it falls back to native Julia implementations that tend to be slower. As the remainder of the vector length modulo the vector width (8 on the host system, thanks to AVX512) grows, `gfortran` shows the performance-degradation pattern typical of LLVM-vectorized code.
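
A hedged way to check whether the GNU vector library is present on a Linux system (our own snippet; SLEEFPirates' internal detection may differ):

```julia
using Libdl

# libmvec ships with GLIBC >= 2.22 and provides vectorized elementary functions.
handle = Libdl.dlopen("libmvec.so.1"; throw_error = false)
println(handle === nothing ? "no libmvec; expect native Julia fallbacks" : "libmvec available")
```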

docs/src/index.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -12,6 +12,7 @@ Pages = [
     "examples/matrix_vector_ops.md",
     "examples/dot_product.md",
     "examples/filtering.md",
+    "examples/special_functions.md",
     "examples/sum_of_squared_error.md",
     "vectorized_convenience_functions.md",
     "future_work.md",
```

src/condense_loopset.jl

Lines changed: 4 additions & 41 deletions
```diff
@@ -202,46 +202,6 @@ end
 @inline array_wrapper(A::Adjoint) = Adjoint
 @inline array_wrapper(A::SubArray) = A.indices
 
-
-# If you change the number of arguments here, make commensurate changes
-# to the `insert!` locations in `setup_call_noinline`.
-@generated function __avx__!(
-    ::Val{UNROLL}, ::Type{OPS}, ::Type{ARF}, ::Type{AM}, ::Type{LPSYM}, lb::LB,
-    ::Val{AR}, ::Val{D}, ::Val{IND}, subsetvals, arraydescript, vargs::Vararg{<:Any,N}
-) where {UNROLL, OPS, ARF, AM, LPSYM, LB, N, AR, D, IND}
-    1 + 1
-    num_vptrs = length(ARF.parameters)::Int
-    vptrs = [gensym(:vptr) for _ ∈ 1:num_vptrs]
-    call = Expr(:call, lv(:_avx_!), Val{UNROLL}(), OPS, ARF, AM, LPSYM, :lb)
-    for n ∈ 1:num_vptrs
-        push!(call.args, vptrs[n])
-    end
-    q = Expr(:block)
-    j = 0
-    assigned_names = Vector{Symbol}(undef, length(AR))
-    num_arrays = 0
-    for i ∈ eachindex(AR)
-        ari = (AR[i])::Int
-        ind = (IND[i])::Union{Nothing,Int}
-        LHS = ind === nothing ? gensym() : vptrs[ind]
-        assigned_names[i] = LHS
-        d = (D[i])::Union{Nothing,Int}
-        if d === nothing
-            num_arrays += 1
-            RHS = Expr(:call, lv(:stridedpointer), Expr(:ref, :vargs, ari), Expr(:ref, :arraydescript, ari))
-        else #subsetview
-            j += 1
-            RHS = Expr(:call, :subsetview, assigned_names[ari], Expr(:call, Expr(:curly, :Val, d)), Expr(:ref, :subsetvals, j))
-        end
-        push!(q.args, Expr(:(=), LHS, RHS))
-    end
-    for n ∈ num_arrays+1:N
-        push!(call.args, Expr(:ref, :vargs, n))
-    end
-    push!(q.args, call)
-    Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), q)
-end
-
 # Try to condense in type stable manner
 function generate_call(ls::LoopSet, inline_unroll::NTuple{3,Int8}, debug::Bool = false)
     operation_descriptions = Expr(:curly, :Tuple)
@@ -330,6 +290,9 @@ function check_args_call(ls::LoopSet)
     q
 end
 
+make_fast(q) = Expr(:macrocall, Symbol("@fastmath"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), q)
+make_crashy(q) = Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), q)
+make_fast_and_crashy(q) = q |> make_fast |> make_crashy
 
 function setup_call_inline(ls::LoopSet, inline::Int8 = zero(Int8), U::Int8 = zero(Int8), T::Int8 = zero(Int8))
     call = generate_call(ls, (inline,U,T))
@@ -369,5 +332,5 @@ function setup_call(ls::LoopSet, q = nothing, inline::Int8 = zero(Int8), u₁::I
     # inlining the generated function into the loop preamble.
     call = setup_call_inline(ls, inline, u₁, u₂)
     isnothing(q) && return Expr(:block, ls.prepreamble, call)
-    Expr(:block, ls.prepreamble, Expr(:if, check_args_call(ls), call, q))
+    Expr(:block, ls.prepreamble, Expr(:if, check_args_call(ls), call, make_fast_and_crashy(q)))
 end
```
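
For orientation, a sketch (ours, not part of the commit) of what the new helpers build; `make_fast` and `make_crashy` are internal, so we import them explicitly:

```julia
using LoopVectorization: make_fast, make_crashy, make_fast_and_crashy

q = :(for i in eachindex(x)
          s += x[i]
      end)
make_fast(q)            # Expr(:macrocall, Symbol("@fastmath"), <line node>, q), i.e. :(@fastmath $q)
make_crashy(q)          # the same idea, wrapping q in @inbounds
make_fast_and_crashy(q) # the loop wrapped in @fastmath, then @inbounds
```

This is what `setup_call` now applies to the fallback loop `q` when `check_args_call` fails at runtime.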

src/constructors.jl

Lines changed: 15 additions & 6 deletions
```diff
@@ -101,14 +101,16 @@ using keyword arguments:
 
 where `body` is the code of the block (e.g., `for ... end`).
 
-`inline` is a Boolean. When `true` (the default), `body` will be directly inlined
+`inline` is a Boolean. When `true`, `body` will be directly inlined
 into the function (via a forced-inlining call to `_avx_!`).
-When `false`, it will call `__avx__!` instead, letting Julia's own inlining engine
-determine whether the call to `__avx__!` should be inlined. (Typically, it won't.)
-In principle, first calling `__avx__!` (which itself calls `_avx_!`) can sometimes
-allow better code generation.
+When `false`, it won't force inlining of the call to `_avx_!`, instead letting Julia's own inlining engine
+determine whether the call to `_avx_!` should be inlined. (Typically, it won't.)
+Sometimes not inlining can lead to substantially worse code generation, with >40% regressions, even in very
+large problems (2-d convolutions are a case where this has been observed).
 One can find some circumstances where `inline=true` is faster, and other circumstances
-where `inline=false` is faster, so the best setting may require experimentation.
+where `inline=false` is faster, so the best setting may require experimentation. By default, the macro
+tries to guess. Currently the algorithm is simple: roughly, if there are more than two dynamically sized loops
+and no convolutions, it will probably not force inlining. Otherwise, it probably will.
@@ -117,6 +119,13 @@ but it applies to the loop ordering and unrolling that will be chosen by LoopVectorization,
 *not* the order in `body`.
 `uᵢ=0` (the default) indicates that LoopVectorization should pick its own value,
 and `uᵢ=-1` disables unrolling for the corresponding loop.
+
+The `@avx` macro also checks the array arguments using `LoopVectorization.check_args` to try to determine
+whether they are compatible with the macro. If `check_args` returns false, a fallback loop annotated with `@inbounds`
+and `@fastmath` is generated. Note that `SIMDPirates` provides functions such as `evadd` and `evmul` that will
+ignore `@fastmath`, preserving IEEE semantics both within `@avx` and `@fastmath`.
+`check_args` currently returns false for some wrapper types like `LinearAlgebra.UpperTriangular`, requiring you to
+use their `parent`. Triangular loops aren't yet supported.
 """
 macro avx(q)
     q = macroexpand(__module__, q)
```
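
As a hedged illustration of the documented keywords (our own sketch; the gemm kernel and the name `mygemm!` are ours, not from the package docs):

```julia
using LoopVectorization

# inline=true forces inlining of `_avx_!`; unroll=(4, 2) requests (u₁, u₂) unrolling.
# Omitting both keywords lets the macro guess, as described above.
function mygemm!(C, A, B)
    @avx inline=true unroll=(4, 2) for m ∈ axes(C, 1), n ∈ axes(C, 2)
        Cmn = zero(eltype(C))
        for k ∈ axes(A, 2)
            Cmn += A[m, k] * B[k, n]
        end
        C[m, n] = Cmn
    end
    C
end
```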

test/fallback.jl

Lines changed: 1 addition & 1 deletion
```diff
@@ -12,7 +12,7 @@
 function msdavx(x)
     s = zero(eltype(x))
     @avx for i in eachindex(x)
-        s += x[i] * x[i]
+        s = muladd(x[i], x[i], s) # Avoids fastmath in fallback loop.
     end
     s
 end
```
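
For context, a hedged sketch (ours, not one of the committed tests) of hitting the fallback path; we assume `BigFloat` arrays fail `check_args`, since they cannot be SIMD-vectorized:

```julia
using LoopVectorization, Test

x = big.(rand(16))             # BigFloat should fail check_args...
@test msdavx(x) ≈ sum(abs2, x) # ...so this exercises the @inbounds @fastmath fallback loop
```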

test/miscellaneous.jl

Lines changed: 14 additions & 12 deletions
```diff
@@ -72,12 +72,13 @@ using Test
         B[j,i] = A[j,i] - x[j]
     end)
     lssubcol = LoopVectorization.LoopSet(subcolq);
-    if LoopVectorization.VectorizationBase.REGISTER_COUNT == 32
-        @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, :i, :j, 4, 6)
-    else
-        # @test LoopVectorization.choose_order(lssubcol) == ([:j, :i], :j, :i, :j, 3, 4)#&-2
-        @test LoopVectorization.choose_order(lssubcol) == ([:j, :i], :i, :j, :j, 4, 4)#&-2
-    end
+    @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, :i, :j, U, T)
+    # if LoopVectorization.VectorizationBase.REGISTER_COUNT == 32
+    #     @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, :i, :j, 4, 6)
+    # else
+    #     # @test LoopVectorization.choose_order(lssubcol) == ([:j, :i], :j, :i, :j, 3, 4)#&-2
+    #     @test LoopVectorization.choose_order(lssubcol) == ([:j, :i], :i, :j, :j, 4, 4)#&-2
+    # end
     ## @avx is SLOWER!!!!
     ## need to fix!
     function mysubcol!(B, A, x)
@@ -102,12 +103,13 @@ using Test
         x[j] += A[j,i] - 0.25
     end)
     lscolsum = LoopVectorization.LoopSet(colsumq);
-    if LoopVectorization.VectorizationBase.REGISTER_COUNT == 32
-        @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, 4, 6)
-    else
-        # @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, 3, 4)
-        @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :i, :j, :j, 4, 4)
-    end
+    @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, U, T)
+    # if LoopVectorization.VectorizationBase.REGISTER_COUNT == 32
+    #     @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, 4, 6)
+    # else
+    #     # @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, 3, 4)
+    #     @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :i, :j, :j, 4, 4)
+    # end
     # my colsum is wrong (by 0.25), but slightly more interesting
     function mycolsum!(x, A)
         @. x = 0
```
