
Commit adc6d66

Merge pull request #205 from JuliaSIMD/vecunroll
Use VecUnroll, add threading support
2 parents d3de8dd + e64841b commit adc6d66

81 files changed: +6256 additions, −2898 deletions


.github/workflows/ci-julia-nightly.yml

Lines changed: 10 additions & 0 deletions
@@ -3,9 +3,19 @@ on:
   pull_request:
     branches:
       - master
+    paths-ignore:
+      - 'LICENSE.md'
+      - 'README.md'
+      - 'utils/*'
+      - '.github/workflows/TagBot.yml'
   push:
     branches:
       - master
+    paths-ignore:
+      - 'LICENSE.md'
+      - 'README.md'
+      - 'utils/*'
+      - '.github/workflows/TagBot.yml'
     tags: '*'
 jobs:
   test-julia-nightly:

.github/workflows/ci.yml

Lines changed: 2 additions & 0 deletions
@@ -6,13 +6,15 @@ on:
     paths-ignore:
       - 'LICENSE.md'
       - 'README.md'
+      - 'utils/*'
       - '.github/workflows/TagBot.yml'
   push:
     branches:
       - master
     paths-ignore:
       - 'LICENSE.md'
       - 'README.md'
+      - 'utils/*'
       - '.github/workflows/TagBot.yml'
     tags: '*'
 jobs:

.gitignore

Lines changed: 6 additions & 0 deletions
@@ -12,3 +12,9 @@
 *.s
 *#
 *.jld2
+Manifest.toml
+test/Manifest.toml
+test/*#*
+*#*
+
+

Project.toml

Lines changed: 9 additions & 5 deletions
@@ -1,30 +1,34 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <[email protected]>"]
-version = "0.11.2"
+version = "0.12.0"

 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
+CheapThreads = "b630d9fa-e28e-4980-896d-83ce5e2106b2"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
+Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
 ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
 UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

 [compat]
-ArrayInterface = "3"
+ArrayInterface = "3.1.4"
+CheapThreads = "0.1.2"
 DocStringExtensions = "0.8"
 IfElse = "0.1"
 OffsetArrays = "1.4.1, 1.5"
 Requires = "1"
-SLEEFPirates = "0.6.7"
-ThreadingUtilities = "0.2.3"
+SLEEFPirates = "0.6.12"
+Static = "0.2"
+ThreadingUtilities = "0.4"
 UnPack = "1"
-VectorizationBase = "0.18.1,0.19"
+VectorizationBase = "0.19.8"
 julia = "1.5"

 [extras]
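
Note on the dependency changes: the new CheapThreads dependency, together with the bumped ThreadingUtilities, backs the commit's headline feature, a threaded variant of the macro, which appears as `@avxt` in the benchmark changes below. A minimal sketch of how it is used (the function name `sumavxt` is illustrative, not part of the package):

using LoopVectorization  # v0.12; pulls in CheapThreads & ThreadingUtilities

# @avxt is the threaded counterpart of @avx: the same loop, with the
# reduction additionally parallelized across Julia threads.
function sumavxt(x)
    s = zero(eltype(x))
    @avxt for i ∈ eachindex(x)
        s += x[i]
    end
    s
end

sumavxt(rand(10^6))  # ≈ sum(x)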

README.md

Lines changed: 7 additions & 8 deletions
@@ -1,10 +1,10 @@
 # LoopVectorization

-[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://chriselrod.github.io/LoopVectorization.jl/stable)
-[![Latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://chriselrod.github.io/LoopVectorization.jl/latest)
-[![CI](https://github.com/chriselrod/LoopVectorization.jl/workflows/CI/badge.svg)](https://github.com/chriselrod/LoopVectorization.jl/actions?query=workflow%3ACI)
-[![CI (Julia nightly)](https://github.com/chriselrod/LoopVectorization.jl/workflows/CI%20(Julia%20nightly)/badge.svg)](https://github.com/chriselrod/LoopVectorization.jl/actions?query=workflow%3A%22CI+%28Julia+nightly%29%22)
-[![Codecov](https://codecov.io/gh/chriselrod/LoopVectorization.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/chriselrod/LoopVectorization.jl)
+[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://JuliaSIMD.github.io/LoopVectorization.jl/stable)
+[![Latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://JuliaSIMD.github.io/LoopVectorization.jl/latest)
+[![CI](https://github.com/JuliaSIMD/LoopVectorization.jl/workflows/CI/badge.svg)](https://github.com/JuliaSIMD/LoopVectorization.jl/actions?query=workflow%3ACI)
+[![CI (Julia nightly)](https://github.com/JuliaSIMD/LoopVectorization.jl/workflows/CI%20(Julia%20nightly)/badge.svg)](https://github.com/JuliaSIMD/LoopVectorization.jl/actions?query=workflow%3A%22CI+%28Julia+nightly%29%22)
+[![Codecov](https://codecov.io/gh/JuliaSIMD/LoopVectorization.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaSIMD/LoopVectorization.jl)

 ## Installation

@@ -21,7 +21,6 @@ We expect that any time you use the `@avx` macro with a given block of code that
 1. Are not indexing an array out of bounds. `@avx` does not perform any bounds checking.
 2. Are not iterating over an empty collection. Iterating over an empty loop such as `for i ∈ eachindex(Float64[])` is undefined behavior, and will likely result in out of bounds memory accesses. Ensure that loops behave correctly.
 3. Are not relying on a specific execution order. `@avx` can and will re-order operations and loops inside its scope, so correctness cannot depend on a particular order. You cannot implement `cumsum` with `@avx`.
-4. Loops increment by 1 on each iteration, e.g. `1:2:N` is not supported at the moment. (This requirement will eventually be lifted.)

 ## Usage

@@ -39,7 +38,7 @@ Please see the documentation for benchmarks versus base Julia, Clang, icc, ifort
 LLVM/Julia by default generate essentially optimal code for a primary vectorized part of this loop. In many cases -- such as the dot product -- this vectorized part of the loop computes 4*SIMD-vector-width iterations at a time.
 On the CPU I'm running these benchmarks on with `Float64` data, the SIMD-vector-width is 8, meaning it will compute 32 iterations at a time.
-However, LLVM is very slow at handling the tails, `length(iterations) % 32`. For this reason, [in benchmark plots](https://chriselrod.github.io/LoopVectorization.jl/latest/examples/dot_product/) you can see performance drop as the size of the remainder increases.
+However, LLVM is very slow at handling the tails, `length(iterations) % 32`. For this reason, [in benchmark plots](https://JuliaSIMD.github.io/LoopVectorization.jl/latest/examples/dot_product/) you can see performance drop as the size of the remainder increases.

 For simple loops like a dot product, LoopVectorization.jl's most important optimization is to handle these tails more efficiently:
 <details>

@@ -346,7 +345,7 @@ Similar approaches can be taken to make kernels working with a variety of numeric
 * [Gaius.jl](https://github.com/MasonProtter/Gaius.jl)
 * [MaBLAS.jl](https://github.com/YingboMa/MaBLAS.jl)
 * [Octavian.jl](https://github.com/JuliaLinearAlgebra/Octavian.jl)
-* [PaddedMatrices.jl](https://github.com/chriselrod/PaddedMatrices.jl)
+* [PaddedMatrices.jl](https://github.com/JuliaSIMD/PaddedMatrices.jl)
 * [RecursiveFactorization.jl](https://github.com/YingboMa/RecursiveFactorization.jl)
 * [SnpArrays.jl](https://github.com/OpenMendel/SnpArrays.jl)
 * [Tullio.jl](https://github.com/mcabbott/Tullio.jl)
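
To make the tail-handling claim above concrete: the dot-product benchmark kernel (defined in benchmark/looptests.jl later in this diff) is just the naive loop under `@avx`; the macro, rather than LLVM, generates the remainder handling. A minimal, self-contained sketch:

using LoopVectorization

# Dot product via @avx; the `length % (unroll * vector-width)` tail is
# handled with masked vector operations rather than a scalar epilogue.
function jdotavx(a, b)
    s = zero(eltype(a))
    @avx for i ∈ eachindex(a)
        s += a[i] * b[i]
    end
    s
end

jdotavx(rand(1000), rand(1000))  # ≈ LinearAlgebra.dot(a, b)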

benchmark/driver.jl

Lines changed: 19 additions & 18 deletions
@@ -196,30 +196,30 @@ end
 sizes = 256:-1:2
 longsizes = 1024:-1:2

-logdettriangle_bench = benchmark_logdettriangle(sizes); println("logdet(LowerTriangular(A)) benchmark results:"); println(logdettriangle_bench)
-dot3_bench = benchmark_dot3(sizes); println("x' * A * y benchmark results:"); println(dot3_bench)
+println("logdet(LowerTriangular(A)) benchmark results:"); logdettriangle_bench = benchmark_logdettriangle(sizes); println(logdettriangle_bench)
+println("x' * A * y benchmark results:"); dot3_bench = benchmark_dot3(sizes); println(dot3_bench)

-AmulB_bench = benchmark_AmulB(sizes); println("A * B benchmark results:"); println(AmulB_bench)
-AmulBt_bench = benchmark_AmulBt(sizes); println("A * B' benchmark results:"); println(AmulBt_bench)
-AtmulBt_bench = benchmark_AtmulBt(sizes); println("A' * B' benchmark results:"); println(AtmulBt_bench)
-AtmulB_bench = benchmark_AtmulB(sizes); println("A' * B benchmark results:"); println(AtmulB_bench)
+println("A * B benchmark results:"); AmulB_bench = benchmark_AmulB(sizes); println(AmulB_bench)
+println("A * B' benchmark results:"); AmulBt_bench = benchmark_AmulBt(sizes); println(AmulBt_bench)
+println("A' * B' benchmark results:"); AtmulBt_bench = benchmark_AtmulBt(sizes); println(AtmulBt_bench)
+println("A' * B benchmark results:"); AtmulB_bench = benchmark_AtmulB(sizes); println(AtmulB_bench)

-Amulvb_bench = benchmark_Amulvb(sizes); println("A * b benchmark results:"); println(Amulvb_bench)
-Atmulvb_bench = benchmark_Atmulvb(sizes); println("A' * b benchmark results:"); println(Atmulvb_bench)
+println("A * b benchmark results:"); Amulvb_bench = benchmark_Amulvb(sizes); println(Amulvb_bench)
+println("A' * b benchmark results:"); Atmulvb_bench = benchmark_Atmulvb(sizes); println(Atmulvb_bench)

-dot_bench = benchmark_dot(longsizes); println("a' * b benchmark results:"); println(dot_bench)
-selfdot_bench = benchmark_selfdot(longsizes); println("a' * a benchmark results:"); println(selfdot_bench)
+println("a' * b benchmark results:"); dot_bench = benchmark_dot(longsizes); println(dot_bench)
+println("a' * a benchmark results:"); selfdot_bench = benchmark_selfdot(longsizes); println(selfdot_bench)

-sse_bench = benchmark_sse(sizes); println("Benchmark results of summing squared error:"); println(sse_bench)
-aplusBc_bench = benchmark_aplusBc(sizes); println("Benchmark results of a .+ B .* c':"); println(aplusBc_bench)
-AplusAt_bench = benchmark_AplusAt(sizes); println("Benchmark results of A .+ A':"); println(AplusAt_bench)
+println("Benchmark results of summing squared error:"); sse_bench = benchmark_sse(sizes); println(sse_bench)
+println("Benchmark results of a .+ B .* c':"); aplusBc_bench = benchmark_aplusBc(sizes); println(aplusBc_bench)
+println("Benchmark results of A .+ A':"); AplusAt_bench = benchmark_AplusAt(sizes); println(AplusAt_bench)

-filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes); println("Benchmark results for dynamically sized 3x3 convolution:"); println(filter2d_dynamic_bench)
-filter2d_3x3_bench = benchmark_filter2d3x3(sizes); println("Benchmark results for statically sized 3x3 convolution:"); println(filter2d_3x3_bench)
-filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes); println("Benchmark results for unrolled 3x3 convolution:"); println(filter2d_unrolled_bench)
+println("Benchmark results for dynamically sized 3x3 convolution:"); filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes); println(filter2d_dynamic_bench)
+println("Benchmark results for statically sized 3x3 convolution:"); filter2d_3x3_bench = benchmark_filter2d3x3(sizes); println(filter2d_3x3_bench)
+println("Benchmark results for unrolled 3x3 convolution:"); filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes); println(filter2d_unrolled_bench)

-vexp_bench = benchmark_exp(sizes); println("Benchmark results of exponentiating a vector:"); println(vexp_bench)
-randomaccess_bench = benchmark_random_access(sizes); println("Benchmark results from using a vector of indices:"); println(randomaccess_bench)
+println("Benchmark results of exponentiating a vector:"); vexp_bench = benchmark_exp(sizes); println(vexp_bench)
+println("Benchmark results from using a vector of indices:"); randomaccess_bench = benchmark_random_access(sizes); println(randomaccess_bench)

 const v = 2
 using Cairo, Fontconfig

@@ -242,6 +242,7 @@ saveplot("bench_AtmulBt_v", AtmulBt_bench);
 saveplot("bench_Amulvb_v", Amulvb_bench);
 saveplot("bench_Atmulvb_v", Atmulvb_bench);

+
 saveplot("bench_logdettriangle_v", logdettriangle_bench);
 saveplot("bench_filter2d_dynamic_v", filter2d_dynamic_bench);
 saveplot("bench_filter2d_3x3_v", filter2d_3x3_bench);

benchmark/looptests.jl

Lines changed: 64 additions & 2 deletions
@@ -64,14 +64,53 @@ function jgemm!(𝐂, 𝐀ᵀ::Adjoint, 𝐁ᵀ::Adjoint)
     end
 end
 function gemmavx!(𝐂, 𝐀, 𝐁)
-    @avx for m ∈ axes(𝐀,1), n ∈ axes(𝐁,2)
+    @avx for m ∈ indices((𝐀,𝐂),1), n ∈ indices((𝐁,𝐂),2)
         𝐂ₘₙ = zero(eltype(𝐂))
-        for k ∈ axes(𝐀,2)
+        for k ∈ indices((𝐀,𝐁),(2,1))
             𝐂ₘₙ += 𝐀[m,k] * 𝐁[k,n]
         end
         𝐂[m,n] = 𝐂ₘₙ
     end
 end
+function gemmavx!(Cc::AbstractMatrix{Complex{T}}, Ac::AbstractMatrix{Complex{T}}, Bc::AbstractMatrix{Complex{T}}) where {T}
+    A = reinterpret(reshape, T, Ac)
+    B = reinterpret(reshape, T, Bc)
+    C = reinterpret(reshape, T, Cc)
+    @avx for m ∈ indices((A,C),2), n ∈ indices((B,C),3)
+        Cre = zero(T)
+        Cim = zero(T)
+        for k ∈ indices((A,B),(3,2))
+            Cre += A[1,m,k]*B[1,k,n] - A[2,m,k]*B[2,k,n]
+            Cim += A[1,m,k]*B[2,k,n] + A[2,m,k]*B[1,k,n]
+        end
+        C[1,m,n] = Cre
+        C[2,m,n] = Cim
+    end
+end
+function gemmavxt!(𝐂, 𝐀, 𝐁)
+    @avxt for m ∈ indices((𝐀,𝐂),1), n ∈ indices((𝐁,𝐂),2)
+        𝐂ₘₙ = zero(eltype(𝐂))
+        for k ∈ indices((𝐀,𝐁),(2,1))
+            𝐂ₘₙ += 𝐀[m,k] * 𝐁[k,n]
+        end
+        𝐂[m,n] = 𝐂ₘₙ
+    end
+end
+function gemmavxt!(Cc::AbstractMatrix{Complex{T}}, Ac::AbstractMatrix{Complex{T}}, Bc::AbstractMatrix{Complex{T}}) where {T}
+    A = reinterpret(reshape, T, Ac)
+    B = reinterpret(reshape, T, Bc)
+    C = reinterpret(reshape, T, Cc)
+    @avxt for m ∈ indices((A,C),2), n ∈ indices((B,C),3)
+        Cre = zero(T)
+        Cim = zero(T)
+        for k ∈ indices((A,B),(3,2))
+            Cre += A[1,m,k]*B[1,k,n] - A[2,m,k]*B[2,k,n]
+            Cim += A[1,m,k]*B[2,k,n] + A[2,m,k]*B[1,k,n]
+        end
+        C[1,m,n] = Cre
+        C[2,m,n] = Cim
+    end
+end
 function jdot(a, b)
     s = zero(eltype(a))
     # @inbounds @simd ivdep for i ∈ eachindex(a,b)

@@ -88,6 +127,14 @@ function jdotavx(a, b)
     end
     s
 end
+function jdotavxt(a, b)
+    s = zero(eltype(a))
+    # @avx for i ∈ eachindex(a,b)
+    @avxt for i ∈ eachindex(a)
+        s += a[i] * b[i]
+    end
+    s
+end
 function jselfdot(a)
     s = zero(eltype(a))
     @inbounds @simd ivdep for i ∈ eachindex(a)

@@ -324,3 +371,18 @@ function filter2dunrolledavx!(out::AbstractMatrix, A::AbstractMatrix, kern::Size
     end
     out
 end
+
+
+# function smooth_line!(sl,nrm1,j,i1,rl,ih2,denom)
+#     @fastmath @inbounds @simd ivdep for i=i1:2:nrm1
+#         sl[i,j]=denom*(rl[i,j]+ih2*(sl[i,j-1]+sl[i-1,j]+sl[i+1,j]+sl[i,j+1]))
+#     end
+# end
+# function smooth_line_avx!(sl,nrm1,j,i1,sl,rl,ih2,denom)
+#     @avx for i=i1:2:nrm1
+#         sl[i,j]=denom*(rl[i,j]+ih2*(sl[i,j-1]+sl[i-1,j]+sl[i+1,j]+sl[i,j+1]))
+#     end
+# end
+
+
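
A note on the new Complex methods above: `reinterpret(reshape, T, Ac)` views an M×N matrix of `Complex{T}` as a 2×M×N array of `T`, so the real and imaginary parts become plain strided data that `@avx`/`@avxt` can vectorize, and `indices` asserts which array axes share an iteration range. A quick sanity check, as a minimal sketch (sizes are illustrative; assumes the definitions above are in scope):

using LinearAlgebra

M, K, N = 48, 56, 40                 # illustrative sizes
A = rand(ComplexF64, M, K); B = rand(ComplexF64, K, N)
C = Matrix{ComplexF64}(undef, M, N)

gemmavx!(C, A, B)     # single-threaded @avx kernel on reinterpreted arrays
@assert C ≈ A * B     # agrees with BLAS up to roundoff
gemmavxt!(C, A, B)    # threaded @avxt variant
@assert C ≈ A * B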

benchmark/openmp.c

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+#include<omp.h>
+
+double dot(double* a, double* b, long N){
+    double s = 0.0;
+    #pragma omp parallel for reduction(+: s)
+    for(long n = 0; n < N; n++){
+        s += a[n]*b[n];
+    }
+    return s;
+}
+
+void cdot(double* c, double* a, double* b, long N){
+    double r = 0.0, i = 0.0;
+    #pragma omp parallel for reduction(+: r, i)
+    for(long n = 0; n < N; n++){
+        r += a[2*n] * b[2*n  ] + a[2*n+1] * b[2*n+1];
+        i += a[2*n] * b[2*n+1] - a[2*n+1] * b[2*n  ];
+    }
+    c[0] = r;
+    c[1] = i;
+    return;
+}
+
+void cdot3(double* c, double* x, double* A, double* y, long M, long N){
+    double sr = 0.0, si = 0.0;
+    #pragma omp parallel for reduction(+: sr, si)
+    for (long n = 0; n < N; n++){
+        double tr = 0.0, ti = 0.0;
+        for(long m = 0; m < M; m++){
+            tr += x[2*m] * A[2*m + 2*n*N] + x[2*m+1] * A[2*m+1 + 2*n*N];
+            ti += x[2*m] * A[2*m+1 + 2*n*N] - x[2*m+1] * A[2*m + 2*n*N];
+        }
+        sr += tr * y[2*n  ] - ti * y[2*n+1];
+        si += tr * y[2*n+1] + ti * y[2*n  ];
+    }
+    c[0] = sr;
+    c[1] = si;
+    return;
+}
+
+void conv(double* B, double* A, double* K, long M, long N){
+    const long offset = 2;
+    #pragma omp parallel for collapse(2)
+    for (long i = offset; i < N-offset; i++){
+        for (long j = offset; j < M-offset; j++){
+            double tmp = 0.0;
+            for (long k = -offset; k < offset + 1; k++){
+                for (long l = -offset; l < offset + 1; l++){
+                    tmp += A[(j+l) + (i+k)*M] * K[(l+offset) + (k+offset)*(2*offset+1)];
+                }
+            }
+            B[(j-offset) + (i-offset) * (M-2*offset)] = tmp;
+        }
+    }
+    return;
+}
+
+
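
These C kernels serve as OpenMP comparison points for the new threaded benchmarks. Their build and binding are not part of this commit; hypothetically, the file would be compiled to a shared library and wrapped from Julia via `ccall`, along these lines (library name, path, and compiler flags are all assumptions):

# Assumed build step, not in this commit:
#   gcc -O3 -march=native -fopenmp -shared -fPIC openmp.c -o libompbench.so
const OMPBENCH = "./libompbench.so"  # hypothetical library path

c_dot(a::Vector{Float64}, b::Vector{Float64}) =
    ccall((:dot, OMPBENCH), Float64,
          (Ptr{Float64}, Ptr{Float64}, Clong), a, b, length(a))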

docs/make.jl

Lines changed: 3 additions & 2 deletions
@@ -7,6 +7,7 @@ makedocs(;
         "Home" => "index.md",
         "Getting Started" => "getting_started.md",
         "Examples" => [
+            "examples/multithreading.md",
            "examples/matrix_multiplication.md",
            "examples/array_interface.md",
            "examples/matrix_vector_ops.md",

@@ -27,12 +28,12 @@
             "devdocs/reference.md"
         ]
     ],
-    # repo="https://github.com/chriselrod/LoopVectorization.jl/blob/{commit}{path}#L{line}",
+    # repo="https://github.com/JuliaSIMD/LoopVectorization.jl/blob/{commit}{path}#L{line}",
     sitename="LoopVectorization.jl",
     authors="Chris Elrod"
     # assets=[],
 )

 deploydocs(;
-    repo="github.com/chriselrod/LoopVectorization.jl",
+    repo="github.com/JuliaSIMD/LoopVectorization.jl",
 )

docs/src/assets/bench_AmulB_v2.png

Binary file changed (−17.7 KB).
