
Commit 27867b1

Merge commit: 2 parents 4ad1034 + f560133

19 files changed: +456 −225 lines changed

Manifest.toml

Lines changed: 6 additions & 6 deletions
@@ -61,15 +61,15 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

 [[SIMDPirates]]
 deps = ["MacroTools", "VectorizationBase"]
-git-tree-sha1 = "6c6a77a41c846c08c61a0e556183a9c33b53e3d1"
+git-tree-sha1 = "c0f42ddb2645c54b8620979df5dc979c4742db59"
 uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a"
-version = "0.1.3"
+version = "0.1.4"

 [[SLEEFPirates]]
 deps = ["SIMDPirates", "VectorizationBase"]
-git-tree-sha1 = "1c5b6827da87a12bdb7a4c893f44c3adbce3389d"
+git-tree-sha1 = "547bcf7d30967d87d4c62b3fe5efdb0e57a6e436"
 uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
-version = "0.1.1"
+version = "0.1.2"

 [[Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

@@ -83,6 +83,6 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

 [[VectorizationBase]]
 deps = ["CpuId", "LinearAlgebra"]
-git-tree-sha1 = "54f5ba672c7d684fb0312825721368e22354ecd5"
+git-tree-sha1 = "81c1b3171d93e64345d75a9f08d190a155e9f009"
 uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
-version = "0.1.5"
+version = "0.1.7"

Project.toml

Lines changed: 4 additions & 4 deletions
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <[email protected]>"]
-version = "0.3.2"
+version = "0.3.5"

 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

@@ -14,9 +14,9 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 [compat]
 MacroTools = "0.5"
 Parameters = "0.12.0"
-SIMDPirates = "0.1.3"
-SLEEFPirates = "0.1.1"
-VectorizationBase = "0.1.5"
+SIMDPirates = "0.1.4"
+SLEEFPirates = "0.1.2"
+VectorizationBase = "0.1.7"
 julia = "1.3.0"

 [extras]

README.md

Lines changed: 85 additions & 2 deletions
@@ -20,6 +20,12 @@ It then tries to vectorize the loop to improve runtime performance.

 The macro assumes that loop iterations can be reordered. It also currently supports simple nested loops, where loop bounds of inner loops are constant across iterations of the outer loop, and only a single loop at each level of the loop nest. These limitations should be removed in a future version.

+## Examples
+### Dot Product
+<details>
+<summary>Click me!</summary>
+<p>
+
 A simple example with a single loop is the dot product:
 ```julia
 using LoopVectorization, BenchmarkTools
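
The diff view truncates this hunk at the opening code fence. As a hedged sketch only (the function name `mydotavx` is illustrative, not necessarily the committed README's), the dot-product kernel the text introduces would look like:

```julia
# Hedged sketch: an @avx dot product; `mydotavx` is an illustrative name.
function mydotavx(a, b)
    s = zero(eltype(a))
    @avx for i ∈ 1:length(a)
        s += a[i] * b[i]
    end
    s
end

x = rand(256); y = rand(256);
@btime mydotavx($x, $y)
```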
@@ -77,6 +83,13 @@ For this reason, we need to unroll the operation to run several independent instructions

 Note that 14 and 12 nm Ryzen chips can only do 1 full width `fma` per clock cycle (and 2 loads), so they should see similar performance with the dot and selfdot. I haven't verified this, but would like to hear from anyone who can.

+</p>
+</details>
+
+### Matrix Multiply
+<details>
+<summary>Click me!</summary>
+<p>

 We can also vectorize fancier loops. A likely familiar example to dive into:
 ```julia
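
This hunk is likewise cut off at the code fence. As a hedged sketch (the name `mygemmavx!` is illustrative; the committed kernel the README actually adds appears as `mul_avx!` in the structs hunk below), the triple-loop matrix multiply the text refers to:

```julia
# Hedged sketch: naive gemm vectorized with @avx; compare mul_avx! in the
# "Dealing with structs" hunk below for the version this commit adds.
function mygemmavx!(C, A, B)
    @avx for i ∈ 1:size(A,1), j ∈ 1:size(B,2)
        Cᵢⱼ = zero(eltype(C))
        for k ∈ 1:size(A,2)
            Cᵢⱼ += A[i,k] * B[k,j]
        end
        C[i,j] = Cᵢⱼ
    end
end
```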
@@ -114,12 +127,19 @@ In the future, I would like it to also model the cost of memory movement in the

 Until then, performance will degrade rapidly compared to BLAS as the size of the matrices increases. The advantage of the `@avx` macro, however, is that it is general. Not every operation is supported by BLAS.

-For example, what if `A` were the outter product of two vectors?
+For example, what if `A` were the outer product of two vectors?
 <!-- ```julia -->

 <!-- ``` -->

+</p>
+</details>
+
+### Broadcasting
+<details>
+<summary>Click me!</summary>
+<p>

 Another example, a straightforward operation expressed well via broadcasting:
 ```julia
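
The broadcasting example is also truncated at the fence; the `d2 = @avx @. a + B * c′;` context line in the next hunk header shows the operation being discussed. A hedged reconstruction:

```julia
# Hedged sketch reconstructed from the `d2 = @avx @. a + B * c′;` context
# line below; the array sizes are illustrative assumptions.
a = rand(100); B = rand(100, 100); c = rand(100); c′ = c';
d1 = @. a + B * c′;       # ordinary fused broadcast
d2 = @avx @. a + B * c′;  # the @avx-optimized version the README benchmarks
```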
@@ -137,13 +157,76 @@ d2 = @avx @. a + B * c′;

 can be optimized in a similar manner to BLAS, albeit to a much smaller degree because the naive version already benefits from vectorization (unlike the naive BLAS).

-You can also use `\ast` for lazy matrix multiplication that can fuse with broadcasts. `.\ast` behaves similarly, espcaping the broadcast (it is not applied elementwise). This allows you to use `@.` and fuse all the loops, even if the arguments to `\ast` are themselves broadcasted objects. However, it will often be the case that creating an intermediary is faster. I would recomend always checking if splitting the operation into pieces, or at least isolating the matrix multiplication, increases performance. That will often be the case, especially if the matrices are large, where a separate multiplication can leverage BLAS (and perhaps take advantage of threads).
+You can also use `*ˡ` (which is typed `*\^l`) for lazy matrix multiplication that can fuse with broadcasts. `.*ˡ` behaves similarly, escaping the broadcast (it is not applied elementwise). This allows you to use `@.` and fuse all the loops, even if the arguments to `*ˡ` are themselves broadcasted objects. However, it will often be the case that creating an intermediary is faster. I would recommend always checking whether splitting the operation into pieces, or at least isolating the matrix multiplication, increases performance. That will often be the case, especially if the matrices are large, where a separate multiplication can leverage BLAS (and perhaps take advantage of threads).

 At small sizes, this can be fast.
 ```julia

 ```

+</p>
+</details>
+
+
+### Dealing with structs
+<details>
+<summary>Click me!</summary>
+<p>
+
+The key to the `@avx` macro's performance gains is leveraging knowledge of exactly how data like `Float64`s and `Int`s are handled by a CPU. As such, it is not straightforward to generalize the `@avx` macro to work on arrays containing structs such as `Matrix{Complex{Float64}}`. Instead, it is currently recommended that users wishing to apply `@avx` to arrays of structs use packages such as [StructArrays.jl](https://github.com/JuliaArrays/StructArrays.jl), which transform an array where each element is a struct into a struct where each element is an array. Using StructArrays.jl, we can write a matrix multiply (gemm) kernel that works on matrices of `Complex{Float64}`s and `Complex{Int}`s:
+```julia
+using LoopVectorization, LinearAlgebra, StructArrays, BenchmarkTools, Test
+
+BLAS.set_num_threads(1); @show BLAS.vendor()
+
+const MatrixFInt64 = Union{Matrix{Float64}, Matrix{Int}}
+
+function mul_avx!(C::MatrixFInt64, A::MatrixFInt64, B::MatrixFInt64)
+    z = zero(eltype(C))
+    @avx for i ∈ 1:size(A,1), j ∈ 1:size(B,2)
+        Cᵢⱼ = z
+        for k ∈ 1:size(A,2)
+            Cᵢⱼ += A[i,k] * B[k,j]
+        end
+        C[i,j] = Cᵢⱼ
+    end
+end
+
+function mul_add_avx!(C::MatrixFInt64, A::MatrixFInt64, B::MatrixFInt64, factor=1)
+    z = zero(eltype(C))
+    @avx for i ∈ 1:size(A,1), j ∈ 1:size(B,2)
+        ΔCᵢⱼ = z
+        for k ∈ 1:size(A,2)
+            ΔCᵢⱼ += A[i,k] * B[k,j]
+        end
+        C[i,j] += factor * ΔCᵢⱼ
+    end
+end
+
+const StructMatrixComplexFInt64 = Union{StructArray{ComplexF64,2}, StructArray{Complex{Int},2}}
+
+function mul_avx!(C::StructMatrixComplexFInt64, A::StructMatrixComplexFInt64, B::StructMatrixComplexFInt64)
+    mul_avx!(    C.re, A.re, B.re)     # C.re = A.re * B.re
+    mul_add_avx!(C.re, A.im, B.im, -1) # C.re = C.re - A.im * B.im
+    mul_avx!(    C.im, A.re, B.im)     # C.im = A.re * B.im
+    mul_add_avx!(C.im, A.im, B.re)     # C.im = C.im + A.im * B.re
+end
+```
+This `mul_avx!` kernel can now accept `StructArray` matrices of complex numbers and multiply them efficiently:
+```julia
+M, K, N = 50, 51, 52
+
+A = StructArray(randn(ComplexF64, M, K));
+B = StructArray(randn(ComplexF64, K, N));
+C1 = StructArray(Matrix{ComplexF64}(undef, M, N));
+C2 = collect(similar(C1));
+
+@btime mul_avx!($C1, $A, $B)
+@btime mul!(    $C2, $(collect(A)), $(collect(B))) # collect turns the StructArray into a regular Array
+@test C1 ≈ C2
+```

+Similar approaches can be taken to make kernels that work with a variety of numeric struct types, such as [dual numbers](https://github.com/JuliaDiff/DualNumbers.jl), [DoubleFloats](https://github.com/JuliaMath/DoubleFloats.jl), etc.

+</p>
+</details>
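
An editor's note on the broadcasting section above: its final code block ("At small sizes, this can be fast.") is empty in the commit. As a hedged illustration of the lazy-multiply fusion the text describes — not the committed example, with sizes chosen here for illustration:

```julia
# Hedged sketch: `*ˡ` (typed `*\^l`) keeps the matrix product lazy so that,
# under `@.`, it fuses with the surrounding broadcast instead of being
# applied elementwise.
using LoopVectorization
A = rand(8, 8); B = rand(8, 8); c = rand(8);
D = @avx @. c + A *ˡ B  # the product A*B fuses with the elementwise `+`
```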
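
The closing sentence about dual numbers suggests the same decomposition trick. A hedged sketch (not in the commit) reusing the `mul_avx!`/`mul_add_avx!` kernels above, assuming a `StructArray` of DualNumbers.jl duals exposes `.value` and `.epsilon` field arrays, and using (a + bε)(c + dε) = ac + (ad + bc)ε:

```julia
# Hedged sketch: dual-number gemm via the real-valued kernels defined above.
# Since (a + bε)(c + dε) = ac + (ad + bc)ε:
function dual_mul_avx!(C, A, B)
    mul_avx!(    C.value,   A.value,   B.value)   # C.value = A.value * B.value
    mul_avx!(    C.epsilon, A.value,   B.epsilon) # C.epsilon = A.value * B.epsilon
    mul_add_avx!(C.epsilon, A.epsilon, B.value)   # C.epsilon += A.epsilon * B.value
end
```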
File renamed without changes.

benchmark/benchmarks.jl

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+
+using BenchmarkTools
+
+const SUITE = BenchmarkGroup()
+SUITE["linalg"] = BenchmarkGroup(["matmul","dot"])
+
+include(joinpath(@__DIR__, "looptests.jl"))
+
+SUITE["linalg"]["matmul"] = BenchmarkGroup()
+SUITE["linalg"]["dot"] = BenchmarkGroup()
+for n ∈ 1:64
+    A = rand(n,n);
+    A′ = copy(A');
+    B = rand(n,n);
+    C = Matrix{Float64}(undef, n, n);
+    SUITE["linalg"]["matmul"]["AmulB", n] = @benchmarkable gemmavx!($C, $A, $B)
+    SUITE["linalg"]["matmul"]["A′mulB", n] = @benchmarkable jAtmulBavx!($C, $A′, $B)
+    x = rand(n); y = rand(n);
+    SUITE["linalg"]["dot"]["dot", n] = @benchmarkable jdotavx($x, $y)
+    SUITE["linalg"]["dot"]["selfdot", n] = @benchmarkable jselfdotavx($x)
+    SUITE["linalg"]["dot"]["dot3", n] = @benchmarkable jdot3avx($x, $A, $y)
+end
+
+
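
The new benchmark/benchmarks.jl exposes a top-level `SUITE`, the convention PkgBenchmark.jl consumes. The commit itself never mentions PkgBenchmark, so this driver is an editor's assumption, sketched for orientation:

```julia
# Hedged sketch: PkgBenchmark picks up `SUITE` from benchmark/benchmarks.jl.
using PkgBenchmark
results = benchmarkpkg("LoopVectorization")       # run the whole suite
export_markdown("benchmark_results.md", results)  # summarize the results
```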

benchmarks/driver.jl renamed to benchmark/driver.jl

Lines changed: 25 additions & 1 deletion
@@ -2,6 +2,12 @@
 # const LOOPVECBENCHDIR = joinpath(pkgdir("LoopVectorization"), "benchmarks")
 # includet(joinpath(LOOPVECBENCHDIR, "driver.jl"))

+pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
+const LOOPVECBENCHDIR = joinpath(pkgdir("LoopVectorization"), "benchmarks")
+include(joinpath(LOOPVECBENCHDIR, "benchmarkflops.jl"))
+include(joinpath(LOOPVECBENCHDIR, "plotbenchmarks.jl"))
+
+
 using Distributed

 addprocs(9);

@@ -33,5 +39,23 @@ exp_bench = fetch(exp_future)
 aplusBc_bench = fetch(aplusBc_future)

-include(joinpath(LOOPVECBENCHDIR, "plotbenchmarks.jl"))
+plot(gemm_bench)
+plot(AtmulB_bench)
+plot(dot_bench)
+plot(selfdot_bench)
+plot(gemv_bench)
+plot(dot3_bench)
+plot(sse_bench)
+plot(exp_bench)
+plot(aplusBc_bench)
+
+save(joinpath("~/Pictures", "bench_gemm_v3.png"), plot(gemm_bench));
+save(joinpath("~/Pictures", "bench_AtmulB_v3.png"), plot(AtmulB_bench));
+save(joinpath("~/Pictures", "bench_dot_v3.png"), plot(dot_bench));
+save(joinpath("~/Pictures", "bench_selfdot_v3.png"), plot(selfdot_bench));
+save(joinpath("~/Pictures", "bench_gemv_v3.png"), plot(gemv_bench));
+save(joinpath("~/Pictures", "bench_dot3_v3.png"), plot(dot3_bench));
+save(joinpath("~/Pictures", "bench_sse_v3.png"), plot(sse_bench));
+save(joinpath("~/Pictures", "bench_exp_v3.png"), plot(exp_bench));
+save(joinpath("~/Pictures", "bench_aplusBc_v3.png"), plot(aplusBc_bench));
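
One hedged editor's note on the new `save` calls: Julia does not expand `~` in paths, so `joinpath("~/Pictures", …)` targets a literal `~` directory. Wrapping the path in `Base.expanduser` resolves it to the home directory:

```julia
# expanduser("~/Pictures") resolves "~" before the file is written.
save(joinpath(expanduser("~/Pictures"), "bench_gemm_v3.png"), plot(gemm_bench));
```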

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
