
Commit fb4037a
committed: Added more documentation.
1 parent 5ad30d9 commit fb4037a

15 files changed, +370 -14 lines

README.md

Lines changed: 7 additions & 0 deletions
@@ -331,3 +331,10 @@ Similar approaches can be taken to make kernels working with a variety of numeri
 
 </p>
 </details>
+
+## Packages using LoopVectorization
+
+* [Gaius](https://github.com/MasonProtter/Gaius.jl)
+
+If you're using LoopVectorization, please feel free to file a PR adding yours to the list!
+

benchmark/looptests.c

Lines changed: 11 additions & 0 deletions
@@ -140,6 +140,17 @@ double dot3(double* restrict x, double* restrict A, double* restrict y, long M,
   }
   return s;
 }
+double dot3v2(double* restrict x, double* restrict A, double* restrict y, long M, long N){
+  double s = 0.0;
+  for (long n = 0; n < N; n++){
+    double t = 0.0;
+    for (long m = 0; m < M; m++){
+      t += x[m] * A[m + n*M];
+    }
+    s += t * y[n];
+  }
+  return s;
+}
 void gemv(double* restrict y, double* restrict A, double* restrict x, long M, long K){
   for (long m = 0; m < M; m++){
     y[m] = 0.0;

benchmark/looptests.f90

Lines changed: 27 additions & 4 deletions
@@ -103,7 +103,7 @@ subroutine AmulBt(C, A, B, M, K, N) BIND(C, name="AmulBt")
   real(C_double), dimension(M, K), intent(in) :: A
   real(C_double), dimension(N, K), intent(in) :: B
   integer(C_long) :: mm, kk, nn
-  C = 0.0
+  C = 0.0d0
   do concurrent(kk = 1:K, nn = 1:N, mm = 1:M)
     C(mm,nn) = C(mm,nn) + A(mm,kk) * B(nn,kk)
   end do
@@ -121,7 +121,7 @@ subroutine AtmulBt(C, A, B, M, K, N) BIND(C, name="AtmulBt")
   real(C_double), dimension(K, M), intent(in) :: A
   real(C_double), dimension(N, K), intent(in) :: B
   integer(C_long) :: mm, kk, nn
-  C = 0.0
+  C = 0.0d0
   do concurrent(nn = 1:N, kk = 1:K, mm = 1:M)
     C(mm,nn) = C(mm,nn) + A(kk,mm) * B(nn,kk)
   end do
@@ -138,7 +138,7 @@ subroutine dot(s, a, b, N) BIND(C, name="dot")
   real(C_double), dimension(N), intent(in) :: a, b
   real(C_double), intent(out) :: s
   integer(C_long) :: i
-  s = 0
+  s = 0d0
   do concurrent(i = 1:N)
     s = s + a(i) * b(i)
   end do
@@ -148,7 +148,7 @@ subroutine selfdot(s, a, N) BIND(C, name="selfdot")
   real(C_double), dimension(N), intent(in) :: a
   real(C_double), intent(out) :: s
   integer(C_long) :: i
-  s = 0
+  s = 0d0
   do concurrent(i = 1:N)
     s = s + a(i) * a(i)
   end do
@@ -157,11 +157,34 @@ subroutine dot3(s, x, A, y, M, N) BIND(C, name="dot3")
   integer(C_long), intent(in) :: M, N
   real(C_double), intent(in) :: x(M), A(M,N), y(N)
   real(C_double), intent(out) :: s
+  real(C_double) :: t
   integer(C_long) :: mm, nn
+  s = 0.0d0
   do concurrent(nn = 1:N, mm = 1:M)
     s = s + x(mm) * A(mm, nn) * y(nn)
   end do
 end subroutine dot3
+subroutine dot3v2(s, x, A, y, M, N) BIND(C, name="dot3v2")
+  integer(C_long), intent(in) :: M, N
+  real(C_double), intent(in) :: x(M), A(M,N), y(N)
+  real(C_double), intent(out) :: s
+  real(C_double) :: t
+  integer(C_long) :: mm, nn
+  s = 0.0d0
+  do concurrent(nn = 1:N)
+    t = 0.0d0
+    do concurrent(mm = 1:M)
+      t = t + x(mm) * A(mm, nn)
+    end do
+    s = s + t * y(nn)
+  end do
+end subroutine dot3v2
+subroutine dot3builtin(s, x, A, y, M, N) BIND(C, name="dot3builtin")
+  integer(C_long), intent(in) :: M, N
+  real(C_double), intent(in) :: x(M), A(M,N), y(N)
+  real(C_double), intent(out) :: s
+  s = dot_product(x, matmul(A, y))
+end subroutine dot3builtin
 !GCC$ builtin (exp) attributes simd (notinbranch) if('x86_64')
 subroutine vexp(b, a, N) BIND(C, name="vexp")
   integer(C_long), intent(in) :: N

benchmark/looptests.jl

Lines changed: 27 additions & 5 deletions
@@ -81,21 +81,43 @@ end
 function jdot3(x, A, y)
     M, N = size(A)
     s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
-    @inbounds for n ∈ 1:N
-        @simd ivdep for m ∈ 1:M
-            @fastmath s += x[m] * A[m,n] * y[n]
-        end
+    @inbounds @fastmath for n ∈ 1:N, m ∈ 1:M
+        s += x[m] * A[m,n] * y[n]
     end
     s
 end
 function jdot3avx(x, A, y)
     M, N = size(A)
     s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
-    @avx for m ∈ 1:M, n ∈ 1:N
+    @avx for n ∈ 1:N, m ∈ 1:M
         s += x[m] * A[m,n] * y[n]
     end
     s
 end
+function jdot3v2(x, A, y)
+    M, N = size(A)
+    s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
+    @inbounds @fastmath for n ∈ 1:N
+        t = zero(s)
+        @simd ivdep for m ∈ 1:M
+            t += x[m] * A[m,n]
+        end
+        s += t * y[n]
+    end
+    s
+end
+function jdot3v2avx(x, A, y)
+    M, N = size(A)
+    s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
+    @avx for n ∈ 1:N
+        t = zero(s)
+        for m ∈ 1:M
+            t += x[m] * A[m,n]
+        end
+        s += t * y[n]
+    end
+    s
+end
 function jvexp!(b, a)
     @inbounds for i ∈ eachindex(a)
         b[i] = exp(a[i])

docs/make.jl

Lines changed: 9 additions & 2 deletions
@@ -6,14 +6,21 @@ makedocs(;
     pages=[
         "Home" => "index.md",
         "Getting Started" => "getting_started.md",
-        "Examples" => Any[
+        "Examples" => [
             "examples/matrix_multiplication.md",
             "examples/matrix_vector_ops.md",
             "examples/dot_product.md",
             "examples/sum_of_squared_error.md"
         ],
         "Vectorized Convenience Functions" => "vectorized_convenience_functions.md",
-        "Future Work" => "future_work.md"
+        "Future Work" => "future_work.md",
+        "Developer Documentation" => [
+            "devdocs/overview.md",
+            "devdocs/loopset_structure.md",
+            "devdocs/constructing_loopsets.md",
+            "devdocs/evaluating_loops.md",
+            "devdocs/lowering.md"
+        ]
     ],
     # repo="https://github.com/chriselrod/LoopVectorization.jl/blob/{commit}{path}#L{line}",
     sitename="LoopVectorization.jl",
docs/src/devdocs/constructing_loopsets.md

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
## Constructing LoopSets

When applying the `@avx` macro to a broadcast expression, the `LoopSet` object is constructed by recursively evaluating [add_broadcast!](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/broadcast.jl#L166) on all the fields. The functions and operations involved, and their relationships, are straightforward to infer from the structure of the nested broadcasts.
```julia
julia> Meta.@lower @. f(g(a,b) + c) / d
:($(Expr(:thunk, CodeInfo(
    @ none within `top-level scope'
1 ─ %1 = Base.broadcasted(g, a, b)
│   %2 = Base.broadcasted(+, %1, c)
│   %3 = Base.broadcasted(f, %2)
│   %4 = Base.broadcasted(/, %3, d)
│   %5 = Base.materialize(%4)
└──      return %5
))))

julia> @macroexpand @avx @. f(g(a,b) + c) / d
quote
    var"##262" = Base.broadcasted(g, a, b)
    var"##263" = Base.broadcasted(+, var"##262", c)
    var"##264" = Base.broadcasted(f, var"##263")
    var"##265" = Base.broadcasted(/, var"##264", d)
    var"##266" = LoopVectorization.vmaterialize(var"##265", Val{:Main}())
end
```
These nested broadcasted objects already express information very similar to what the `LoopSet` objects hold. The dimensionality of each object provides the information on its associated loop dependencies.
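For instance (an illustrative aside using only Base Julia, not LoopVectorization internals), the per-argument dimensionality is visible directly on the lazy `Broadcasted` object:
```julia
julia> using Base.Broadcast: broadcasted

julia> A = rand(3, 4); b = rand(3);

julia> bc = broadcasted(+, A, b);   # lazy Broadcasted object, not yet materialized

julia> map(ndims, bc.args)          # a 2-d and a 1-d argument ⇒ the 1-d one depends only on the first loop
(2, 1)
```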
When applying `@avx` to a loop expression, it creates a `LoopSet` without access to any type information, and then [condenses the information](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/condense_loopset.jl) into a summary that is passed as type information to a generated function.
```julia
julia> @macroexpand @avx for m ∈ 1:M, n ∈ 1:N
           C[m,n] = zero(eltype(B))
           for k ∈ 1:K
               C[m,n] += A[m,k] * B[k,n]
           end
       end
quote
    var"##vptr##_C" = LoopVectorization.stridedpointer(C)
    var"##vptr##_A" = LoopVectorization.stridedpointer(A)
    var"##vptr##_B" = LoopVectorization.stridedpointer(B)
    begin
$(Expr(:gc_preserve, :(LoopVectorization._avx_!(Val{(0, 0)}(), Tuple{:numericconstant, Symbol("##zero#270"), LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01), :LoopVectorization, :setindex!, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000000, 0x0000000000000007, LoopVectorization.memstore, 0x01, 0x02), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000013, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x02, 0x03), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000032, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x03, 0x04), :numericconstant, Symbol("##reductzero#274"), LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000000, 0x0000000000000003, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x05), :LoopVectorization, :vfmadd_fast, LoopVectorization.OperationStruct(0x0000000000000132, 0x0000000000000003, 0x0000000000000000, 0x0000000000030405, LoopVectorization.compute, 0x00, 0x05), :LoopVectorization, :reduce_to_add, LoopVectorization.OperationStruct(0x0000000000000012, 0x0000000000000003, 0x0000000000000000, 0x0000000000000601, LoopVectorization.compute, 0x00, 0x01)}, Tuple{LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000102, 0xffffffffffffe03b), LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000103, 0xffffffffffffffd6), LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000302, 0xffffffffffffe056), LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000102, 0xffffffffffffffd6)}, Tuple{0, Tuple{}, Tuple{}, Tuple{}, Tuple{}, Tuple{(1, LoopVectorization.IntOrFloat), (5, LoopVectorization.IntOrFloat)}, Tuple{}}, (LoopVectorization.StaticLowerUnitRange{0}(M), LoopVectorization.StaticLowerUnitRange{0}(N), LoopVectorization.StaticLowerUnitRange{0}(K)), var"##vptr##_C", var"##vptr##_A", var"##vptr##_B", var"##vptr##_C")), :C, :A, :B))
    end
end
```
This summary is then [reconstructed](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/reconstruct_loopset.jl) using the available type information. This type information can be used, for example, to realize that an array has been transposed, and thus to correctly identify which axis contains contiguous elements that are efficient to load from.
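As a user-level sketch of why that matters (the kernel mirrors `jdot3avx` from this commit's `benchmark/looptests.jl`; the name `avxdot3` is just illustrative), the same `@avx` loop can be fed either a column-major matrix or its lazy transpose, and the reconstruction step sees the wrapper type and picks the contiguous axis accordingly:
```julia
using LoopVectorization

function avxdot3(x, A, y)   # computes s = xᵀ A y
    M, N = size(A)
    s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
    @avx for n ∈ 1:N, m ∈ 1:M
        s += x[m] * A[m, n] * y[n]
    end
    s
end

x = rand(4); A = rand(4, 5); y = rand(5);
avxdot3(x, A, y) ≈ x' * A * y    # plain column-major matrix
avxdot3(y, A', x) ≈ y' * A' * x  # lazily transposed matrix: same kernel, different memory layout
```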
The three chief components of the summaries are the definitions of operations, e.g.:
```julia
:LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000013, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x02, 0x03)
```
the referenced array objects:
```julia
LoopVectorization.ArrayRefStruct(0x0000000000000101, 0x0000000000000102, 0xffffffffffffe03b)
```
and the set of loop bounds:
```julia
(LoopVectorization.StaticLowerUnitRange{0}(M), LoopVectorization.StaticLowerUnitRange{0}(N), LoopVectorization.StaticLowerUnitRange{0}(K))
```

docs/src/devdocs/evaluating_loops.md

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
## Determining the strategy for evaluating loops

The heart of the optimizations performed by LoopVectorization is in the [determinestrategy.jl](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/determinestrategy.jl) file, which uses the instruction costs specified in [costs.jl](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/costs.jl).
Essentially, it estimates the cost of different means of evaluating the loops. It iterates through the possible loop orders, while also considering which loops to unroll and which to vectorize. It will consider unrolling one or two loops (though it may settle on an unroll factor of 1, i.e. not unrolling), and vectorizing one.
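A quick way to poke at this from the REPL is the internal helper `LoopVectorization.choose_order` (it lives in determinestrategy.jl; treat the call below as a sketch, since the exact return value and signature have varied between versions). Applied to the `lsAmulB` LoopSet built on the LoopSet-structure page, it reports the selected loop order along with the unrolling and vectorization choices:
```julia
julia> using LoopVectorization

julia> AmulBq = :(for m ∈ 1:M, n ∈ 1:N
           C[m,n] = zero(eltype(B))
           for k ∈ 1:K
               C[m,n] += A[m,k] * B[k,n]
           end
       end);

julia> lsAmulB = LoopVectorization.LoopSet(AmulBq);

julia> LoopVectorization.choose_order(lsAmulB)  # loop order plus unroll/vectorization factors (output is version-dependent)
```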

docs/src/devdocs/loopset_structure.md

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
## LoopSet Structure

A `LoopSet` models a set of loops as a collection of operations that depend on one another and on the loops. Cycles are not allowed, making the result a directed acyclic graph. Currently, only single return values are supported.
Let's use a set of nested loops performing matrix multiplication as an example. We can create a naive `LoopSet` from an expression (naive because it is created without access to any type information):
```julia
julia> using LoopVectorization

julia> AmulBq = :(for m ∈ 1:M, n ∈ 1:N
           C[m,n] = zero(eltype(B))
           for k ∈ 1:K
               C[m,n] += A[m,k] * B[k,n]
           end
       end);

julia> lsAmulB = LoopVectorization.LoopSet(AmulBq);
```
This `LoopSet` consists of seven operations that define the relationships within the loop:
```julia
julia> LoopVectorization.operations(lsAmulB)
7-element Array{LoopVectorization.Operation,1}:
 var"##RHS#256" = var"##zero#257"
 C[m, n] = var"##RHS#256"
 var"##tempload#258" = A[m, k]
 var"##tempload#259" = B[k, n]
 var"##reduction#260" = var"##reductzero#261"
 var"##reduction#260" = LoopVectorization.vfmadd_fast(var"##tempload#258", var"##tempload#259", var"##reduction#260")
 var"##RHS#256" = LoopVectorization.reduce_to_add(var"##reduction#260", var"##RHS#256")
```
The act of performing a "reduction" across a loop introduces a few extra operations that manage creating a "zero" with respect to the reduction and then combining it with the specified value using `reduce_to_add`, which performs any necessary type conversions, such as from an `SVec` vector type to a scalar. This simplifies code generation by making the functions agnostic with respect to the actual vectorization decisions the library makes.
Each operation is listed as depending on a set of loop iteration symbols:
```julia
julia> LoopVectorization.loopdependencies.(LoopVectorization.operations(lsAmulB))
7-element Array{Array{Symbol,1},1}:
 [:m, :n]
 [:m, :n]
 [:m, :k]
 [:k, :n]
 [:m, :n]
 [:m, :k, :n]
 [:m, :n]
```
We can also see which operations each of these operations depends on:
```julia
julia> LoopVectorization.operations(lsAmulB)[6]
var"##reduction#260" = LoopVectorization.vfmadd_fast(var"##tempload#258", var"##tempload#259", var"##reduction#260")

julia> LoopVectorization.parents(ans)
3-element Array{LoopVectorization.Operation,1}:
 var"##tempload#258" = A[m, k]
 var"##tempload#259" = B[k, n]
 var"##reduction#260" = var"##reductzero#261"
```
References to arrays are represented with an `ArrayReferenceMeta` data structure:
```julia
julia> LoopVectorization.operations(lsAmulB)[3].ref
LoopVectorization.ArrayReferenceMeta(LoopVectorization.ArrayReference(:A, [:m, :k], Int8[0, 0]), Bool[1, 1], Symbol("##vptr##_A"))
```
It contains the name of the parent array (`:A`), the indices `[:m,:k]`, and a boolean vector (`Bool[1, 1]`) indicating whether these indices are loop iterables. Note that the optimizer assumes arrays are column-major, and thus that it is efficient to read contiguous elements from the first index. In lower-level terms, this means [high-throughput vmov](https://www.felixcloutier.com/x86/movupd) instructions can be used rather than [low-throughput](https://www.felixcloutier.com/x86/vgatherdpd:vgatherqpd) [gathers](https://www.felixcloutier.com/x86/vgatherqps:vgatherqpd); the same applies to storing elements.
When no axis has unit stride, the first given index will be the dummy `Symbol("##DISCONTIGUOUSSUBARRAY##")`.

docs/src/devdocs/lowering.md

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
## Lowering

The first step of lowering is picking a strategy for evaluating the loops. A Julia expression is then created following that strategy, converting each of the operations into Julia expressions.
This task is made simpler by multiple dispatch, which keeps the lowering of individual components independent of the larger picture. For example, a load will look like
```julia
vload(vptr_A, (i,j,k))
```
with the behavior of the load determined by the types of the arguments. Vectorization is expressed by making an index an `_MM{W}` type rather than an integer; operations on it will either produce another `_MM{W}` when the result still corresponds to contiguous loads, or an `SVec{W,<:Integer}` if the resulting loads will be discontiguous, so that a `gather` or `scatter!` will be used. If all indices are plain integers, this produces a scalar load or store.
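As a toy illustration of that dispatch idea (standalone stand-in types, not LoopVectorization's actual `_MM`/`vload` implementation), the index types alone can decide whether an access is a scalar load, a contiguous vector load, or a gather-like strided load:
```julia
# Toy stand-in: an index of this type means "this axis is vectorized with width W".
struct VecIndex{W}
    i::Int
end

# Dispatch on the index tuple selects the access pattern.
toyload(A, (i, j)::Tuple{Int,Int}) = A[i, j]                  # scalar load
toyload(A, (i, j)::Tuple{VecIndex{W},Int}) where {W} =
    [A[i.i + w, j] for w in 0:W-1]                            # contiguous (vmov-like) load
toyload(A, (i, j)::Tuple{Int,VecIndex{W}}) where {W} =
    [A[i, j.i + w] for w in 0:W-1]                            # strided access ⇒ gather-like load

A = reshape(collect(1.0:12.0), 3, 4)
toyload(A, (2, 3))                # a single element
toyload(A, (VecIndex{2}(1), 3))   # two contiguous elements of column 3
toyload(A, (1, VecIndex{2}(2)))   # two elements of row 1, three apart in memory
```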

docs/src/devdocs/overview.md

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
## Developer Overview

Here I will try to explain how the library works, for the curious or any would-be contributors.

The library uses a [LoopSet](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/graphs.jl#L146) object to model loops. The key components of the library can be divided into:
1. Defining the `LoopSet` objects.
2. Constructing the `LoopSet` objects.
3. Determining the strategy for evaluating the loops.
4. Lowering the `LoopSet` object into a Julia `Expr` following that strategy.
