
Commit fb4037a
committed: Added more documentation.
1 parent 5ad30d9 commit fb4037a

15 files changed, +370 -14 lines

README.md

Lines changed: 7 additions & 0 deletions
@@ -331,3 +331,10 @@ Similar approaches can be taken to make kernels working with a variety of numeri
 
 </p>
 </details>
+
+## Packages using LoopVectorization
+
+* [Gaius](https://github.com/MasonProtter/Gaius.jl)
+
+If you're using LoopVectorization, please feel free to file a PR adding yours to the list!
+

benchmark/looptests.c

Lines changed: 11 additions & 0 deletions
@@ -140,6 +140,17 @@ double dot3(double* restrict x, double* restrict A, double* restrict y, long M,
   }
   return s;
 }
+double dot3v2(double* restrict x, double* restrict A, double* restrict y, long M, long N){
+  double s = 0.0;
+  for (long n = 0; n < N; n++){
+    double t = 0.0;
+    for (long m = 0; m < M; m++){
+      t += x[m] * A[m + n*M];
+    }
+    s += t * y[n];
+  }
+  return s;
+}
 void gemv(double* restrict y, double* restrict A, double* restrict x, long M, long K){
   for (long m = 0; m < M; m++){
     y[m] = 0.0;

benchmark/looptests.f90

Lines changed: 27 additions & 4 deletions
@@ -103,7 +103,7 @@ subroutine AmulBt(C, A, B, M, K, N) BIND(C, name="AmulBt")
   real(C_double), dimension(M, K), intent(in) :: A
   real(C_double), dimension(N, K), intent(in) :: B
   integer(C_long) :: mm, kk, nn
-  C = 0.0
+  C = 0.0d0
   do concurrent(kk = 1:K, nn = 1:N, mm = 1:M)
     C(mm,nn) = C(mm,nn) + A(mm,kk) * B(nn,kk)
   end do
@@ -121,7 +121,7 @@ subroutine AtmulBt(C, A, B, M, K, N) BIND(C, name="AtmulBt")
   real(C_double), dimension(K, M), intent(in) :: A
   real(C_double), dimension(N, K), intent(in) :: B
   integer(C_long) :: mm, kk, nn
-  C = 0.0
+  C = 0.0d0
   do concurrent(nn = 1:N, kk = 1:K, mm = 1:M)
     C(mm,nn) = C(mm,nn) + A(kk,mm) * B(nn,kk)
   end do
@@ -138,7 +138,7 @@ subroutine dot(s, a, b, N) BIND(C, name="dot")
   real(C_double), dimension(N), intent(in) :: a, b
   real(C_double), intent(out) :: s
   integer(C_long) :: i
-  s = 0
+  s = 0d0
   do concurrent(i = 1:N)
     s = s + a(i) * b(i)
   end do
@@ -148,7 +148,7 @@ subroutine selfdot(s, a, N) BIND(C, name="selfdot")
   real(C_double), dimension(N), intent(in) :: a
   real(C_double), intent(out) :: s
   integer(C_long) :: i
-  s = 0
+  s = 0d0
   do concurrent(i = 1:N)
     s = s + a(i) * a(i)
   end do
@@ -157,11 +157,34 @@ subroutine dot3(s, x, A, y, M, N) BIND(C, name="dot3")
   integer(C_long), intent(in) :: M, N
   real(C_double), intent(in) :: x(M), A(M,N), y(N)
   real(C_double), intent(out) :: s
+  real(C_double) :: t
   integer(C_long) :: mm, nn
+  s = 0.0d0
   do concurrent(nn = 1:N, mm = 1:M)
     s = s + x(mm) * A(mm, nn) * y(nn)
   end do
 end subroutine dot3
+subroutine dot3v2(s, x, A, y, M, N) BIND(C, name="dot3v2")
+  integer(C_long), intent(in) :: M, N
+  real(C_double), intent(in) :: x(M), A(M,N), y(N)
+  real(C_double), intent(out) :: s
+  real(C_double) :: t
+  integer(C_long) :: mm, nn
+  s = 0.0d0
+  do concurrent(nn = 1:N)
+    t = 0.0d0
+    do concurrent(mm = 1:M)
+      t = t + x(mm) * A(mm, nn)
+    end do
+    s = s + t * y(nn)
+  end do
+end subroutine dot3v2
+subroutine dot3builtin(s, x, A, y, M, N) BIND(C, name="dot3builtin")
+  integer(C_long), intent(in) :: M, N
+  real(C_double), intent(in) :: x(M), A(M,N), y(N)
+  real(C_double), intent(out) :: s
+  s = dot_product(x, matmul(A, y))
+end subroutine dot3builtin
 !GCC$ builtin (exp) attributes simd (notinbranch) if('x86_64')
 subroutine vexp(b, a, N) BIND(C, name="vexp")
   integer(C_long), intent(in) :: N

benchmark/looptests.jl

Lines changed: 27 additions & 5 deletions
@@ -81,21 +81,43 @@ end
 function jdot3(x, A, y)
     M, N = size(A)
     s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
-    @inbounds for n ∈ 1:N
-        @simd ivdep for m ∈ 1:M
-            @fastmath s += x[m] * A[m,n] * y[n]
-        end
+    @inbounds @fastmath for n ∈ 1:N, m ∈ 1:M
+        s += x[m] * A[m,n] * y[n]
     end
     s
 end
 function jdot3avx(x, A, y)
     M, N = size(A)
     s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
-    @avx for m ∈ 1:M, n ∈ 1:N
+    @avx for n ∈ 1:N, m ∈ 1:M
         s += x[m] * A[m,n] * y[n]
     end
     s
 end
+function jdot3v2(x, A, y)
+    M, N = size(A)
+    s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
+    @inbounds @fastmath for n ∈ 1:N
+        t = zero(s)
+        @simd ivdep for m ∈ 1:M
+            t += x[m] * A[m,n]
+        end
+        s += t * y[n]
+    end
+    s
+end
+function jdot3v2avx(x, A, y)
+    M, N = size(A)
+    s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
+    @avx for n ∈ 1:N
+        t = zero(s)
+        for m ∈ 1:M
+            t += x[m] * A[m,n]
+        end
+        s += t * y[n]
+    end
+    s
+end
 function jvexp!(b, a)
     @inbounds for i ∈ eachindex(a)
         b[i] = exp(a[i])

docs/make.jl

Lines changed: 9 additions & 2 deletions
@@ -6,14 +6,21 @@ makedocs(;
     pages=[
         "Home" => "index.md",
         "Getting Started" => "getting_started.md",
-        "Examples" => Any[
+        "Examples" => [
             "examples/matrix_multiplication.md",
             "examples/matrix_vector_ops.md",
             "examples/dot_product.md",
             "examples/sum_of_squared_error.md"
         ],
         "Vectorized Convenience Functions" => "vectorized_convenience_functions.md",
-        "Future Work" => "future_work.md"
+        "Future Work" => "future_work.md",
+        "Developer Documentation" => [
+            "devdocs/overview.md",
+            "devdocs/loopset_structure.md",
+            "devdocs/constructing_loopsets.md",
+            "devdocs/evaluating_loops.md",
+            "devdocs/lowering.md"
+        ]
     ],
     # repo="https://github.com/chriselrod/LoopVectorization.jl/blob/{commit}{path}#L{line}",
     sitename="LoopVectorization.jl",
docs/src/devdocs/constructing_loopsets.md

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
## Constructing LoopSets

When applying the `@avx` macro to a broadcast expression, the `LoopSet` object is constructed by recursively evaluating [add_broadcast!](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/broadcast.jl#L166) on all the fields. The functions and operations involved, and their relationships, are straightforward to infer from the structure of the nested broadcasts.
```julia
julia> Meta.@lower @. f(g(a,b) + c) / d
:($(Expr(:thunk, CodeInfo(
    @ none within `top-level scope'
1 ─ %1 = Base.broadcasted(g, a, b)
│   %2 = Base.broadcasted(+, %1, c)
│   %3 = Base.broadcasted(f, %2)
│   %4 = Base.broadcasted(/, %3, d)
│   %5 = Base.materialize(%4)
└──      return %5
))))

julia> @macroexpand @avx @. f(g(a,b) + c) / d
quote
    var"##262" = Base.broadcasted(g, a, b)
    var"##263" = Base.broadcasted(+, var"##262", c)
    var"##264" = Base.broadcasted(f, var"##263")
    var"##265" = Base.broadcasted(/, var"##264", d)
    var"##266" = LoopVectorization.vmaterialize(var"##265", Val{:Main}())
end
```
These nested broadcasted objects already express information very similar to what the `LoopSet` objects hold. The dimensionality of each object provides the information on its associated loop dependencies.
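For instance (an illustrative aside using only Base Julia, not LoopVectorization internals), the per-argument dimensionality is visible directly on the lazy `Broadcasted` object:
```julia
julia> using Base.Broadcast: broadcasted

julia> A = rand(3, 4); b = rand(3);

julia> bc = broadcasted(+, A, b);   # lazy Broadcasted object, not yet materialized

julia> map(ndims, bc.args)          # a 2-d and a 1-d argument ⇒ the 1-d one depends only on the first loop
(2, 1)
```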
When applying `@avx` to a loop expression, it creates a `LoopSet` without access to any type information, and then [condenses the information](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/condense_loopset.jl) into a summary that is passed as type information to a generated function.
```julia
julia> @macroexpand @avx for m ∈ 1:M, n ∈ 1:N
           C[m,n] = zero(eltype(B))
           for k ∈ 1:K
               C[m,n] += A[m,k] * B[k,n]
           end
       end
quote
    var"##vptr##_C" = LoopVectorization.stridedpointer(C)
    var"##vptr##_A" = LoopVectorization.stridedpointer(A)
    var"##vptr##_B" = LoopVectorization.stridedpointer(B)
    begin
$(Expr(:gc_preserve, :(LoopVectorization._avx_!(Val{(0, 0)}(), Tuple{:numericconstant, Symbol("##zero#270"), LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01), :LoopVectorization, :setindex!, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000007, LoopVectorization.memstore, 0x01, 0x02), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000013, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x02, 0x03), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000032, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x03, 0x04), :numericconstant, Symbol("##reductzero#274"), LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000003, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x05), :LoopVectorization, :vfmadd_fast, LoopVectorization.OperationStruct(0x0000000000000132, 0x0000000000000003, 0x0000000000000000, 0x0000000000030405, LoopVectorization.compute, 0x00, 0x05), :LoopVectorization, :reduce_to_add, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000003, 0x0000000000000000, 0x0000000000000601, LoopVectorization.compute, 0x00, 0x01)}, Tuple{LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000102, 0xffffffffffffe03b), LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000103, 0xffffffffffffffd6), LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000302, 0xffffffffffffe056), LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000102, 0xffffffffffffffd6)}, Tuple{0, Tuple{}, Tuple{}, Tuple{}, Tuple{}, Tuple{(1, LoopVectorization.IntOrFloat), (5, LoopVectorization.IntOrFloat)}, Tuple{}}, (LoopVectorization.StaticLowerUnitRange{0}(M), LoopVectorization.StaticLowerUnitRange{0}(N), LoopVectorization.StaticLowerUnitRange{0}(K)), var"##vptr##_C", var"##vptr##_A", var"##vptr##_B", var"##vptr##_C")), :C, :A, :B))
    end
end
```
This summary is then [reconstructed](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/reconstruct_loopset.jl) using the available type information. This type information can be used, for example, to realize that an array has been transposed, and thus to correctly identify which axis contains contiguous elements that are efficient to load from.
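As a user-level sketch of why that matters (the kernel mirrors `jdot3avx` from this commit's `benchmark/looptests.jl`; the name `avxdot3` is just illustrative), the same `@avx` loop can be fed either a column-major matrix or its lazy transpose, and the reconstruction step sees the wrapper type and picks the contiguous axis accordingly:
```julia
using LoopVectorization

function avxdot3(x, A, y)   # computes s = xᵀ A y
    M, N = size(A)
    s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
    @avx for n ∈ 1:N, m ∈ 1:M
        s += x[m] * A[m, n] * y[n]
    end
    s
end

x = rand(4); A = rand(4, 5); y = rand(5);
avxdot3(x, A, y) ≈ x' * A * y    # plain column-major matrix
avxdot3(y, A', x) ≈ y' * A' * x  # lazily transposed matrix: same kernel, different memory layout
```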
The three chief components of the summaries are the definitions of operations, e.g.:
```julia
:LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000013, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x02, 0x03)
```
the referenced array objects:
```julia
LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000102, 0xffffffffffffe03b)
```
and the set of loop bounds:
```julia
(LoopVectorization.StaticLowerUnitRange{0}(M), LoopVectorization.StaticLowerUnitRange{0}(N), LoopVectorization.StaticLowerUnitRange{0}(K))
```

docs/src/devdocs/evaluating_loops.md

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
## Determining the strategy for evaluating loops

The heart of the optimizations performed by LoopVectorization is in the [determinestrategy.jl](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/determinestrategy.jl) file, which uses the instruction costs specified in [costs.jl](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/costs.jl).
Essentially, it estimates the cost of different means of evaluating the loops. It iterates through the possible loop orders, while also considering which loops to unroll and which to vectorize. It will consider unrolling one or two loops (though it may settle on an unroll factor of 1, i.e. not unrolling), and vectorizing one.
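A quick way to poke at this from the REPL is the internal helper `LoopVectorization.choose_order` (it lives in determinestrategy.jl; treat the call below as a sketch, since the exact return value and signature have varied between versions). Applied to the `lsAmulB` LoopSet built on the LoopSet-structure page, it reports the selected loop order along with the unrolling and vectorization choices:
```julia
julia> using LoopVectorization

julia> AmulBq = :(for m ∈ 1:M, n ∈ 1:N
           C[m,n] = zero(eltype(B))
           for k ∈ 1:K
               C[m,n] += A[m,k] * B[k,n]
           end
       end);

julia> lsAmulB = LoopVectorization.LoopSet(AmulBq);

julia> LoopVectorization.choose_order(lsAmulB)  # loop order plus unroll/vectorization factors (output is version-dependent)
```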

docs/src/devdocs/loopset_structure.md

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
## LoopSet Structure

A `LoopSet` models a set of loops as a collection of operations that depend on one another and on the loops. Cycles are not allowed, making the result a directed acyclic graph. Currently, only single return values are supported.
Let's use a set of nested loops performing matrix multiplication as an example. We can create a naive `LoopSet` from an expression (naive because it is created without access to any type information):
```julia
julia> using LoopVectorization

julia> AmulBq = :(for m ∈ 1:M, n ∈ 1:N
           C[m,n] = zero(eltype(B))
           for k ∈ 1:K
               C[m,n] += A[m,k] * B[k,n]
           end
       end);

julia> lsAmulB = LoopVectorization.LoopSet(AmulBq);
```
This `LoopSet` consists of seven operations that define the relationships within the loop:
```julia
julia> LoopVectorization.operations(lsAmulB)
7-element Array{LoopVectorization.Operation,1}:
 var"##RHS#256" = var"##zero#257"
 C[m, n] = var"##RHS#256"
 var"##tempload#258" = A[m, k]
 var"##tempload#259" = B[k, n]
 var"##reduction#260" = var"##reductzero#261"
 var"##reduction#260" = LoopVectorization.vfmadd_fast(var"##tempload#258", var"##tempload#259", var"##reduction#260")
 var"##RHS#256" = LoopVectorization.reduce_to_add(var"##reduction#260", var"##RHS#256")
```
The act of performing a "reduction" across a loop introduces a few extra operations that manage creating a "zero" with respect to the reduction and then combining it with the specified value using `reduce_to_add`, which performs any necessary type conversions, such as from an `SVec` vector type to a scalar. This simplifies code generation by making the functions agnostic with respect to the actual vectorization decisions the library makes.
Each operation is listed as depending on a set of loop iteration symbols:
```julia
julia> LoopVectorization.loopdependencies.(LoopVectorization.operations(lsAmulB))
7-element Array{Array{Symbol,1},1}:
 [:m, :n]
 [:m, :n]
 [:m, :k]
 [:k, :n]
 [:m, :n]
 [:m, :k, :n]
 [:m, :n]
```
We can also see which operations each of these operations depends on:
```julia
julia> LoopVectorization.operations(lsAmulB)[6]
var"##reduction#260" = LoopVectorization.vfmadd_fast(var"##tempload#258", var"##tempload#259", var"##reduction#260")

julia> LoopVectorization.parents(ans)
3-element Array{LoopVectorization.Operation,1}:
 var"##tempload#258" = A[m, k]
 var"##tempload#259" = B[k, n]
 var"##reduction#260" = var"##reductzero#261"
```
References to arrays are represented with an `ArrayReferenceMeta` data structure:
```julia
julia> LoopVectorization.operations(lsAmulB)[3].ref
LoopVectorization.ArrayReferenceMeta(LoopVectorization.ArrayReference(:A, [:m, :k], Int8[0, 0]), Bool[1, 1], Symbol("##vptr##_A"))
```
It contains the name of the parent array (`:A`), the indices `[:m,:k]`, and a boolean vector (`Bool[1, 1]`) indicating whether these indices are loop iterables. Note that the optimizer assumes arrays are column-major, and thus that it is efficient to read contiguous elements from the first index. In lower-level terms, this means [high-throughput vmov](https://www.felixcloutier.com/x86/movupd) instructions can be used rather than [low-throughput](https://www.felixcloutier.com/x86/vgatherdpd:vgatherqpd) [gathers](https://www.felixcloutier.com/x86/vgatherqps:vgatherqpd); the same applies to storing elements.
When no axis has unit stride, the first given index will be the dummy `Symbol("##DISCONTIGUOUSSUBARRAY##")`.

docs/src/devdocs/lowering.md

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
## Lowering

The first step of lowering is picking a strategy for evaluating the loops. A Julia expression is then created following that strategy, converting each of the operations into Julia expressions.
This task is made simpler by multiple dispatch, which keeps the lowering of individual components independent of the larger picture. For example, a load will look like
```julia
vload(vptr_A, (i,j,k))
```
with the behavior of the load determined by the types of the arguments. Vectorization is expressed by making an index an `_MM{W}` type rather than an integer; operations on it will either produce another `_MM{W}` when the result still corresponds to contiguous loads, or an `SVec{W,<:Integer}` if the resulting loads will be discontiguous, so that a `gather` or `scatter!` will be used. If all indices are plain integers, this produces a scalar load or store.
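As a toy illustration of that dispatch idea (standalone stand-in types, not LoopVectorization's actual `_MM`/`vload` implementation), the index types alone can decide whether an access is a scalar load, a contiguous vector load, or a gather-like strided load:
```julia
# Toy stand-in: an index of this type means "this axis is vectorized with width W".
struct VecIndex{W}
    i::Int
end

# Dispatch on the index tuple selects the access pattern.
toyload(A, (i, j)::Tuple{Int,Int}) = A[i, j]                  # scalar load
toyload(A, (i, j)::Tuple{VecIndex{W},Int}) where {W} =
    [A[i.i + w, j] for w in 0:W-1]                            # contiguous (vmov-like) load
toyload(A, (i, j)::Tuple{Int,VecIndex{W}}) where {W} =
    [A[i, j.i + w] for w in 0:W-1]                            # strided access ⇒ gather-like load

A = reshape(collect(1.0:12.0), 3, 4)
toyload(A, (2, 3))                # a single element
toyload(A, (VecIndex{2}(1), 3))   # two contiguous elements of column 3
toyload(A, (1, VecIndex{2}(2)))   # two elements of row 1, three apart in memory
```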

docs/src/devdocs/overview.md

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
## Developer Overview

Here I will try to explain how the library works, for the curious or any would-be contributors.

The library uses a [LoopSet](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/graphs.jl#L146) object to model loops. The key components of the library can be divided into:
1. Defining the `LoopSet` objects.
2. Constructing the `LoopSet` objects.
3. Determining the strategy for evaluating the loops.
4. Lowering the `LoopSet` object into a Julia `Expr` following that strategy.
