Commit 053c8d2

Minor doc updates and update non-AVX512 tests.
1 parent 6251cb7 commit 053c8d2

10 files changed: +71 −68 lines

benchmark/looptests.jl

Lines changed: 6 additions & 6 deletions
```diff
@@ -259,17 +259,17 @@ function randomaccessavx(P, basis, coeffs::Vector{T}) where {T}
     end
     return p
 end
-function jlogdettriangle(T::Union{LowerTriangular,UpperTriangular})
+function jlogdettriangle(B::Union{LowerTriangular,UpperTriangular})
     ld = 0.0
-    @inbounds for n ∈ 1:size(T,1)
-        ld += log(T[n,n])
+    @inbounds @fastmath for n ∈ 1:size(B,1)
+        ld += log(B[n,n])
     end
     ld
 end
-function jlogdettriangleavx(T::Union{LowerTriangular,UpperTriangular})
-    A = parent(T) # No longer supported
+function jlogdettriangleavx(B::Union{LowerTriangular,UpperTriangular})
+    A = parent(B) # No longer supported
     ld = zero(eltype(A))
-    @avx for n ∈ 1:size(T,1)
+    @avx for n ∈ axes(A,1)
         ld += log(A[n,n])
     end
     ld
```

docs/make.jl

Lines changed: 1 addition & 0 deletions
```diff
@@ -10,6 +10,7 @@ makedocs(;
         "examples/matrix_multiplication.md",
         "examples/matrix_vector_ops.md",
         "examples/dot_product.md",
+        "examples/special_functions.md",
         "examples/sum_of_squared_error.md",
         "examples/filtering.md"
     ],
```

docs/src/assets/bench_logdettriangle_v1.svg

Lines changed: 1 addition & 1 deletion

docs/src/examples/matrix_vector_ops.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -3,7 +3,7 @@
 Here I'll discuss a variety of Matrix-vector operations, naturally starting with matrix-vector multiplication.
 
 ```julia
-@inline function jgemvavx!(𝐲, 𝐀, 𝐱)
+function jgemvavx!(𝐲, 𝐀, 𝐱)
     @avx for i ∈ eachindex(𝐲)
         𝐲ᵢ = zero(eltype(𝐲))
         for j ∈ eachindex(𝐱)
````

docs/src/examples/special_functions.md (new file)

Lines changed: 27 additions & 0 deletions

# Special Functions

`LoopVectorization` supports vectorizing many special functions; for example, to calculate the log determinant of a triangular matrix:
```julia
function logdettriangle(B::Union{LowerTriangular,UpperTriangular})
    A = parent(B) # using a triangular matrix would fall back to the default loop.
    ld = zero(eltype(A))
    @avx for n ∈ axes(A,1)
        ld += log(A[n,n])
    end
    ld
end
```
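
As a quick sanity check (our own usage example, not part of the committed page), the result should agree with `LinearAlgebra.logdet` applied to the triangular wrapper:

```julia
using LinearAlgebra, LoopVectorization

U = UpperTriangular(rand(100, 100) .+ 1.0) # the shift keeps the diagonal inside log's domain
logdettriangle(U) ≈ logdet(U)              # true, up to floating-point roundoff
```
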
![logdettriangle](../assets/bench_logdettriangle_v1.svg)

While Intel's proprietary compilers do best, LoopVectorization performs very well among the open-source alternatives. A complicating factor in the benchmark above is that accessing the diagonal does not touch contiguous elements. A benchmark that simply exponentiates a vector shows that `gcc` also vectorizes special functions efficiently, but that its autovectorizer balks at the discontiguous memory accesses:

![exp](../assets/bench_exp_v1.svg)
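
The exponentiation benchmark presumably times a loop like the following sketch (our reconstruction; the name `vexp!` is ours):

```julia
function vexp!(y, x)
    @avx for i ∈ eachindex(x)
        y[i] = exp(x[i]) # the vectorized exp comes from SLEEFPirates
    end
    y
end
```
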
The similar performance between `gfortran` and `LoopVectorization` at multiples of 8 is no fluke: on Linux systems with a recent GLIBC, SLEEFPirates.jl -- which LoopVectorization depends on to vectorize these special functions -- looks for the GNU vector library and uses its functions when available. Otherwise, it falls back to native Julia implementations that tend to be slower. As the remainder of the vector length modulo the vector width (8 on the host system, thanks to AVX512) grows, `gfortran` shows the performance-degradation pattern typical of LLVM-vectorized code.
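
A hedged way to check whether the GNU vector library is present on a Linux system (our own snippet; SLEEFPirates' internal detection may differ):

```julia
using Libdl

# libmvec ships with GLIBC >= 2.22 and provides vectorized elementary functions.
handle = Libdl.dlopen("libmvec.so.1"; throw_error = false)
println(handle === nothing ? "no libmvec; expect native Julia fallbacks" : "libmvec available")
```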

docs/src/index.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -12,6 +12,7 @@ Pages = [
     "examples/matrix_vector_ops.md",
     "examples/dot_product.md",
     "examples/filtering.md",
+    "examples/special_functions.md",
     "examples/sum_of_squared_error.md",
     "vectorized_convenience_functions.md",
     "future_work.md",
```

src/condense_loopset.jl

Lines changed: 4 additions & 41 deletions
```diff
@@ -202,46 +202,6 @@ end
 @inline array_wrapper(A::Adjoint) = Adjoint
 @inline array_wrapper(A::SubArray) = A.indices
 
-
-# If you change the number of arguments here, make commensurate changes
-# to the `insert!` locations in `setup_call_noinline`.
-@generated function __avx__!(
-    ::Val{UNROLL}, ::Type{OPS}, ::Type{ARF}, ::Type{AM}, ::Type{LPSYM}, lb::LB,
-    ::Val{AR}, ::Val{D}, ::Val{IND}, subsetvals, arraydescript, vargs::Vararg{<:Any,N}
-) where {UNROLL, OPS, ARF, AM, LPSYM, LB, N, AR, D, IND}
-    1 + 1
-    num_vptrs = length(ARF.parameters)::Int
-    vptrs = [gensym(:vptr) for _ ∈ 1:num_vptrs]
-    call = Expr(:call, lv(:_avx_!), Val{UNROLL}(), OPS, ARF, AM, LPSYM, :lb)
-    for n ∈ 1:num_vptrs
-        push!(call.args, vptrs[n])
-    end
-    q = Expr(:block)
-    j = 0
-    assigned_names = Vector{Symbol}(undef, length(AR))
-    num_arrays = 0
-    for i ∈ eachindex(AR)
-        ari = (AR[i])::Int
-        ind = (IND[i])::Union{Nothing,Int}
-        LHS = ind === nothing ? gensym() : vptrs[ind]
-        assigned_names[i] = LHS
-        d = (D[i])::Union{Nothing,Int}
-        if d === nothing
-            num_arrays += 1
-            RHS = Expr(:call, lv(:stridedpointer), Expr(:ref, :vargs, ari), Expr(:ref, :arraydescript, ari))
-        else #subsetview
-            j += 1
-            RHS = Expr(:call, :subsetview, assigned_names[ari], Expr(:call, Expr(:curly, :Val, d)), Expr(:ref, :subsetvals, j))
-        end
-        push!(q.args, Expr(:(=), LHS, RHS))
-    end
-    for n ∈ num_arrays+1:N
-        push!(call.args, Expr(:ref, :vargs, n))
-    end
-    push!(q.args, call)
-    Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), q)
-end
-
 # Try to condense in type stable manner
 function generate_call(ls::LoopSet, inline_unroll::NTuple{3,Int8}, debug::Bool = false)
     operation_descriptions = Expr(:curly, :Tuple)
@@ -330,6 +290,9 @@ function check_args_call(ls::LoopSet)
     q
 end
 
+make_fast(q) = Expr(:macrocall, Symbol("@fastmath"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), q)
+make_crashy(q) = Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), q)
+make_fast_and_crashy(q) = q |> make_fast |> make_crashy
 
 function setup_call_inline(ls::LoopSet, inline::Int8 = zero(Int8), U::Int8 = zero(Int8), T::Int8 = zero(Int8))
     call = generate_call(ls, (inline,U,T))
@@ -369,5 +332,5 @@ function setup_call(ls::LoopSet, q = nothing, inline::Int8 = zero(Int8), u₁::I
     # inlining the generated function into the loop preamble.
     call = setup_call_inline(ls, inline, u₁, u₂)
     isnothing(q) && return Expr(:block, ls.prepreamble, call)
-    Expr(:block, ls.prepreamble, Expr(:if, check_args_call(ls), call, q))
+    Expr(:block, ls.prepreamble, Expr(:if, check_args_call(ls), call, make_fast_and_crashy(q)))
 end
```
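
For orientation, a sketch (ours, not part of the commit) of what the new helpers build; `make_fast` and `make_crashy` are internal, so we import them explicitly:

```julia
using LoopVectorization: make_fast, make_crashy, make_fast_and_crashy

q = :(for i in eachindex(x)
          s += x[i]
      end)
make_fast(q)            # Expr(:macrocall, Symbol("@fastmath"), <line node>, q), i.e. :(@fastmath $q)
make_crashy(q)          # the same idea, wrapping q in @inbounds
make_fast_and_crashy(q) # the loop wrapped in @fastmath, then @inbounds
```

This is what `setup_call` now applies to the fallback loop `q` when `check_args_call` fails at runtime.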

src/constructors.jl

Lines changed: 15 additions & 6 deletions
```diff
@@ -101,14 +101,16 @@ using keyword arguments:
 
 where `body` is the code of the block (e.g., `for ... end`).
 
-`inline` is a Boolean. When `true` (the default), `body` will be directly inlined
+`inline` is a Boolean. When `true`, `body` will be directly inlined
 into the function (via a forced-inlining call to `_avx_!`).
-When `false`, it will call `__avx__!` instead, letting Julia's own inlining engine
-determine whether the call to `__avx__!` should be inlined. (Typically, it won't.)
-In principle, first calling `__avx__!` (which itself calls `_avx_!`) can sometimes
-allow better code generation.
+When `false`, it won't force inlining of the call to `_avx_!`, instead letting Julia's own inlining engine
+determine whether the call to `_avx_!` should be inlined. (Typically, it won't.)
+Sometimes not inlining can lead to substantially worse code generation, with >40% regressions, even in very
+large problems (2-d convolutions are a case where this has been observed).
 One can find some circumstances where `inline=true` is faster, and other circumstances
-where `inline=false` is faster, so the best setting may require experimentation.
+where `inline=false` is faster, so the best setting may require experimentation. By default, the macro
+tries to guess. Currently the algorithm is simple: roughly, if there are more than two dynamically sized loops
+and no convolutions, it will probably not force inlining. Otherwise, it probably will.
@@ -117,6 +119,13 @@ but it applies to the loop ordering and unrolling that will be chosen by LoopVectorization,
 *not* the order in `body`.
 `uᵢ=0` (the default) indicates that LoopVectorization should pick its own value,
 and `uᵢ=-1` disables unrolling for the corresponding loop.
+
+The `@avx` macro also checks the array arguments using `LoopVectorization.check_args` to try to determine
+whether they are compatible with the macro. If `check_args` returns false, a fallback loop annotated with `@inbounds`
+and `@fastmath` is generated. Note that `SIMDPirates` provides functions such as `evadd` and `evmul` that will
+ignore `@fastmath`, preserving IEEE semantics both within `@avx` and `@fastmath`.
+`check_args` currently returns false for some wrapper types like `LinearAlgebra.UpperTriangular`, requiring you to
+use their `parent`. Triangular loops aren't yet supported.
 """
 macro avx(q)
     q = macroexpand(__module__, q)
```
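
As a hedged illustration of the documented keywords (our own sketch; the gemm kernel and the name `mygemm!` are ours, not from the package docs):

```julia
using LoopVectorization

# inline=true forces inlining of `_avx_!`; unroll=(4, 2) requests (u₁, u₂) unrolling.
# Omitting both keywords lets the macro guess, as described above.
function mygemm!(C, A, B)
    @avx inline=true unroll=(4, 2) for m ∈ axes(C, 1), n ∈ axes(C, 2)
        Cmn = zero(eltype(C))
        for k ∈ axes(A, 2)
            Cmn += A[m, k] * B[k, n]
        end
        C[m, n] = Cmn
    end
    C
end
```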

test/fallback.jl

Lines changed: 1 addition & 1 deletion
```diff
@@ -12,7 +12,7 @@
 function msdavx(x)
     s = zero(eltype(x))
     @avx for i in eachindex(x)
-        s += x[i] * x[i]
+        s = muladd(x[i], x[i], s) # Avoids fastmath in fallback loop.
     end
     s
 end
```
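
For context, a hedged sketch (ours, not one of the committed tests) of hitting the fallback path; we assume `BigFloat` arrays fail `check_args`, since they cannot be SIMD-vectorized:

```julia
using LoopVectorization, Test

x = big.(rand(16))             # BigFloat should fail check_args...
@test msdavx(x) ≈ sum(abs2, x) # ...so this exercises the @inbounds @fastmath fallback loop
```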

test/miscellaneous.jl

Lines changed: 14 additions & 12 deletions
```diff
@@ -72,12 +72,13 @@ using Test
         B[j,i] = A[j,i] - x[j]
     end)
     lssubcol = LoopVectorization.LoopSet(subcolq);
-    if LoopVectorization.VectorizationBase.REGISTER_COUNT == 32
-        @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, :i, :j, 4, 6)
-    else
-        # @test LoopVectorization.choose_order(lssubcol) == ([:j, :i], :j, :i, :j, 3, 4)#&-2
-        @test LoopVectorization.choose_order(lssubcol) == ([:j, :i], :i, :j, :j, 4, 4)#&-2
-    end
+    @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, :i, :j, U, T)
+    # if LoopVectorization.VectorizationBase.REGISTER_COUNT == 32
+    #     @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, :i, :j, 4, 6)
+    # else
+    #     # @test LoopVectorization.choose_order(lssubcol) == ([:j, :i], :j, :i, :j, 3, 4)#&-2
+    #     @test LoopVectorization.choose_order(lssubcol) == ([:j, :i], :i, :j, :j, 4, 4)#&-2
+    # end
     ## @avx is SLOWER!!!!
     ## need to fix!
     function mysubcol!(B, A, x)
@@ -102,12 +103,13 @@ using Test
         x[j] += A[j,i] - 0.25
     end)
     lscolsum = LoopVectorization.LoopSet(colsumq);
-    if LoopVectorization.VectorizationBase.REGISTER_COUNT == 32
-        @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, 4, 6)
-    else
-        # @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, 3, 4)
-        @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :i, :j, :j, 4, 4)
-    end
+    @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, U, T)
+    # if LoopVectorization.VectorizationBase.REGISTER_COUNT == 32
+    #     @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, 4, 6)
+    # else
+    #     # @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, 3, 4)
+    #     @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :i, :j, :j, 4, 4)
+    # end
     # my colsum is wrong (by 0.25), but slightly more interesting
     function mycolsum!(x, A)
         @. x = 0
```
