
Commit 04d577c

@turbo, ambiguity fixes.

1 parent 6d3c75c


52 files changed: +407 −400 lines

Project.toml

Lines changed: 2 additions & 2 deletions

@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <[email protected]>"]
-version = "0.12.21"
+version = "0.12.22"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -27,7 +27,7 @@ OffsetArrays = "1.4.1"
 Requires = "1"
 SLEEFPirates = "0.6.18"
 Static = "0.2"
-StrideArraysCore = "0.1.5"
+StrideArraysCore = "0.1.11"
 ThreadingUtilities = "0.4.2"
 UnPack = "1"
 VectorizationBase = "0.20.4"

README.md

Lines changed: 12 additions & 12 deletions

@@ -17,15 +17,15 @@ LoopVectorization is supported on Julia 1.1 and later. It is tested on Julia 1.5
 ## Warning
 
 Misusing LoopVectorization can have [serious consequences](http://catb.org/jargon/html/N/nasal-demons.html). Like `@inbounds`, misusing it can lead to segfaults and memory corruption.
-We expect that any time you use the `@avx` macro with a given block of code, you:
-1. Are not indexing an array out of bounds. `@avx` does not perform any bounds checking.
+We expect that any time you use the `@turbo` macro with a given block of code, you:
+1. Are not indexing an array out of bounds. `@turbo` does not perform any bounds checking.
 2. Are not iterating over an empty collection. Iterating over an empty loop such as `for i ∈ eachindex(Float64[])` is undefined behavior, and will likely result in out-of-bounds memory accesses. Ensure that loops behave correctly.
-3. Are not relying on a specific execution order. `@avx` can and will re-order operations and loops inside its scope, so correctness cannot depend on a particular order. You cannot implement `cumsum` with `@avx`.
+3. Are not relying on a specific execution order. `@turbo` can and will re-order operations and loops inside its scope, so correctness cannot depend on a particular order. You cannot implement `cumsum` with `@turbo`.
 4. Are not using multiple loops at the same level in nested loops.
 
 ## Usage
 
-This library provides the `@avx` macro, which may be used to prefix a `for` loop or broadcast statement.
+This library provides the `@turbo` macro, which may be used to prefix a `for` loop or broadcast statement.
 It then tries to vectorize the loop to improve runtime performance.
 
 The macro assumes that loop iterations can be reordered. It also currently supports simple nested loops, where loop bounds of inner loops are constant across iterations of the outer loop, and only a single loop at each level of the loop nest. These limitations should be removed in a future version.
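To make rule 3 concrete: a prefix sum is the canonical order-dependent loop. A minimal sketch (illustrative, not from this repository) of a loop that must *not* be wrapped in `@turbo`:

```julia
function prefix_sum!(out, x)
    acc = zero(eltype(x))
    for i ∈ eachindex(x, out)   # correct only in this exact order
        acc += x[i]
        out[i] = acc            # depends on every earlier iteration
    end
    out
end
# Wrapping this loop in `@turbo` would be undefined behavior:
# the macro may reorder iterations, breaking the accumulation chain.
```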
@@ -60,7 +60,7 @@ mydot (generic function with 1 method)
 
 julia> function mydotavx(a, b)
            s = 0.0
-           @avx for i ∈ eachindex(a,b)
+           @turbo for i ∈ eachindex(a,b)
                s += a[i]*b[i]
            end
            s
@@ -111,7 +111,7 @@ julia> function mygemm!(C, A, B)
 mygemm! (generic function with 1 method)
 
 julia> function mygemmavx!(C, A, B)
-           @avx for m ∈ axes(A,1), n ∈ axes(B,2)
+           @turbo for m ∈ axes(A,1), n ∈ axes(B,2)
               Cmn = zero(eltype(C))
               for k ∈ axes(A,2)
                   Cmn += A[m,k] * B[k,n]
@@ -207,7 +207,7 @@ julia> A = rand(5,77); B = rand(77, 51); C = rand(51,49); D = rand(49,51);
 
 julia> X1 = view(A,1,:) .+ B * (C .+ D');
 
-julia> X2 = @avx view(A,1,:) .+ B .*ˡ (C .+ D');
+julia> X2 = @turbo view(A,1,:) .+ B .*ˡ (C .+ D');
 
 julia> @test X1 ≈ X2
 Test Passed
@@ -219,7 +219,7 @@ julia> buf2 = similar(X1);
 julia> @btime $X1 .= view($A,1,:) .+ mul!($buf2, $B, ($buf1 .= $C .+ $D'));
   9.188 μs (0 allocations: 0 bytes)
 
-julia> @btime @avx $X2 .= view($A,1,:) .+ $B .*ˡ ($C .+ $D');
+julia> @btime @turbo $X2 .= view($A,1,:) .+ $B .*ˡ ($C .+ $D');
   6.751 μs (0 allocations: 0 bytes)
 
 julia> @test X1 ≈ X2
@@ -238,7 +238,7 @@ This may improve as the optimizations within LoopVectorization improve.
 Note that loops will generally be faster than broadcasting. This is because the behavior of broadcasts is determined by runtime information (i.e., dimensions other than the leading dimension of size `1` will be broadcast; it is not known at compile time which these will be).
 ```julia
 julia> function AmulBtest!(C,A,Bk,Bn,d)
-           @avx for m ∈ axes(A,1), n ∈ axes(Bk,2)
+           @turbo for m ∈ axes(A,1), n ∈ axes(Bk,2)
               ΔCmn = zero(eltype(C))
               for k ∈ axes(A,2)
                   ΔCmn += A[m,k] * (Bk[k,n] + Bn[n,k])
@@ -276,7 +276,7 @@ BenchmarkTools.Trial:
 <summary>Click me!</summary>
 <p>
 
-The key to the `@avx` macro's performance gains is leveraging knowledge of exactly how data like `Float64`s and `Int`s are handled by a CPU. As such, it is not straightforward to generalize the `@avx` macro to work on arrays containing structs such as `Matrix{Complex{Float64}}`. Instead, it is currently recommended that users wishing to apply `@avx` to arrays of structs use packages such as [StructArrays.jl](https://github.com/JuliaArrays/StructArrays.jl), which transform an array where each element is a struct into a struct where each element is an array. Using StructArrays.jl, we can write a matrix-multiply (gemm) kernel that works on matrices of `Complex{Float64}`s and `Complex{Int}`s:
+The key to the `@turbo` macro's performance gains is leveraging knowledge of exactly how data like `Float64`s and `Int`s are handled by a CPU. As such, it is not straightforward to generalize the `@turbo` macro to work on arrays containing structs such as `Matrix{Complex{Float64}}`. Instead, it is currently recommended that users wishing to apply `@turbo` to arrays of structs use packages such as [StructArrays.jl](https://github.com/JuliaArrays/StructArrays.jl), which transform an array where each element is a struct into a struct where each element is an array. Using StructArrays.jl, we can write a matrix-multiply (gemm) kernel that works on matrices of `Complex{Float64}`s and `Complex{Int}`s:
 ```julia
 using LoopVectorization, LinearAlgebra, StructArrays, BenchmarkTools, Test
 
@@ -285,7 +285,7 @@ BLAS.set_num_threads(1); @show BLAS.vendor()
 const MatrixFInt64 = Union{Matrix{Float64}, Matrix{Int}}
 
 function mul_avx!(C::MatrixFInt64, A::MatrixFInt64, B::MatrixFInt64)
-    @avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
+    @turbo for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
         Cmn = zero(eltype(C))
         for k ∈ 1:size(A,2)
             Cmn += A[m,k] * B[k,n]
@@ -295,7 +295,7 @@ function mul_avx!(C::MatrixFInt64, A::MatrixFInt64, B::MatrixFInt64)
 end
 
 function mul_add_avx!(C::MatrixFInt64, A::MatrixFInt64, B::MatrixFInt64, factor=1)
-    @avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
+    @turbo for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
         ΔCmn = zero(eltype(C))
         for k ∈ 1:size(A,2)
             ΔCmn += A[m,k] * B[k,n]
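For readers unfamiliar with StructArrays.jl, here is a minimal sketch (variable names illustrative) of the array-of-structs to struct-of-arrays transformation the README describes, which yields the plain `Matrix{Float64}` fields these kernels can consume:

```julia
using StructArrays

Ac = rand(ComplexF64, 4, 4)  # array of structs
A  = StructArray(Ac)         # struct of arrays
A.re                         # 4×4 Matrix{Float64} of real parts
A.im                         # 4×4 Matrix{Float64} of imaginary parts
```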

benchmark/benchmarkflops.jl

Lines changed: 3 additions & 3 deletions

@@ -275,7 +275,7 @@ end
 function exp_bench!(br, s, i)
     a = rand(s); b = similar(a)
     n_gflop = 1e-9*s # not really gflops
-    br[1,i] = n_gflop / @belapsed @avx @. $b = exp($a)
+    br[1,i] = n_gflop / @belapsed @turbo @. $b = exp($a)
     baseb = copy(b)
     br[2,i] = n_gflop / @belapsed @. $b = exp($a)
     @assert b ≈ baseb "LoopVec wrong?"
@@ -296,7 +296,7 @@ function aplusBc_bench!(br, s, i)
     a = rand(M); B = rand(M,N); c = rand(N);
     c′ = c'; D = similar(B)
     n_gflop = 2e-9 * M*N
-    br[1,i] = n_gflop / @belapsed @avx @. $D = $a + $B * $c′
+    br[1,i] = n_gflop / @belapsed @turbo @. $D = $a + $B * $c′
     Dcopy = copy(D); fill!(D, NaN);
     br[2,i] = n_gflop / @belapsed @. $D = $a + $B * $c′
     @assert D ≈ Dcopy "LoopVec wrong?"
@@ -319,7 +319,7 @@ end
 function AplusAt_bench!(br, s, i)
     A = rand(s,s); B = similar(A)
     n_gflop = 1e-9*s^2
-    br[1,i] = n_gflop / @belapsed @avx @. $B = $A + $A'
+    br[1,i] = n_gflop / @belapsed @turbo @. $B = $A + $A'
     baseB = copy(B); fill!(B, NaN);
     br[2,i] = n_gflop / @belapsed @. $B = $A + $A'
     @assert B ≈ baseB "LoopVec wrong?"
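The pattern in each benchmark above is GFLOPS = (FLOP count × 1e-9) / seconds, with BenchmarkTools' `@belapsed` returning the minimum elapsed time in seconds. A standalone sketch, with sizes of my own choosing:

```julia
using LoopVectorization, BenchmarkTools

a = rand(512); b = similar(a)
n_gflop = 1e-9 * length(a)            # one exp per element; "not really gflops"
t = @belapsed @turbo @. $b = exp($a)  # minimum time in seconds
gflops = n_gflop / t
```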

benchmark/looptests.jl

Lines changed: 22 additions & 22 deletions

@@ -64,7 +64,7 @@ function jgemm!(𝐂, 𝐀ᵀ::Adjoint, 𝐁ᵀ::Adjoint)
     end
 end
 function gemmavx!(𝐂, 𝐀, 𝐁)
-    @avx for m ∈ indices((𝐀,𝐂),1), n ∈ indices((𝐁,𝐂),2)
+    @turbo for m ∈ indices((𝐀,𝐂),1), n ∈ indices((𝐁,𝐂),2)
         𝐂ₘₙ = zero(eltype(𝐂))
         for k ∈ indices((𝐀,𝐁),(2,1))
             𝐂ₘₙ += 𝐀[m,k] * 𝐁[k,n]
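A note on the `indices` helper these kernels use: `indices((𝐀,𝐂),1)` iterates a single range covering dimension 1 of both arrays, and `indices((𝐀,𝐁),(2,1))` pairs dimension 2 of the first array with dimension 1 of the second. A hedged sketch of that reading (treat the exact return values as an assumption):

```julia
using LoopVectorization  # exports `indices`

A = rand(8, 4); B = rand(4, 8); C = rand(8, 8)
indices((A, C), 1)       # shared range over dim 1 of A and C, i.e. 1:8
indices((A, B), (2, 1))  # dim 2 of A paired with dim 1 of B, i.e. 1:4
```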
@@ -76,7 +76,7 @@ function gemmavx!(Cc::AbstractMatrix{Complex{T}}, Ac::AbstractMatrix{Complex{T}}
     A = reinterpret(reshape, T, Ac)
     B = reinterpret(reshape, T, Bc)
     C = reinterpret(reshape, T, Cc)
-    @avx for m ∈ indices((A,C),2), n ∈ indices((B,C),3)
+    @turbo for m ∈ indices((A,C),2), n ∈ indices((B,C),3)
         Cre = zero(T)
         Cim = zero(T)
         for k ∈ indices((A,B),(3,2))
@@ -88,7 +88,7 @@ function gemmavx!(Cc::AbstractMatrix{Complex{T}}, Ac::AbstractMatrix{Complex{T}}
     end
 end
 function gemmavxt!(𝐂, 𝐀, 𝐁)
-    @avxt for m ∈ indices((𝐀,𝐂),1), n ∈ indices((𝐁,𝐂),2)
+    @tturbo for m ∈ indices((𝐀,𝐂),1), n ∈ indices((𝐁,𝐂),2)
         𝐂ₘₙ = zero(eltype(𝐂))
         for k ∈ indices((𝐀,𝐁),(2,1))
             𝐂ₘₙ += 𝐀[m,k] * 𝐁[k,n]
@@ -100,7 +100,7 @@ function gemmavxt!(Cc::AbstractMatrix{Complex{T}}, Ac::AbstractMatrix{Complex{T}
     A = reinterpret(reshape, T, Ac)
     B = reinterpret(reshape, T, Bc)
     C = reinterpret(reshape, T, Cc)
-    @avxt for m ∈ indices((A,C),2), n ∈ indices((B,C),3)
+    @tturbo for m ∈ indices((A,C),2), n ∈ indices((B,C),3)
         Cre = zero(T)
         Cim = zero(T)
         for k ∈ indices((A,B),(3,2))
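The `reinterpret(reshape, T, Ac)` calls above (available on Julia ≥ 1.6) view a complex matrix as a real array with a new leading dimension of length 2, so real and imaginary parts can be loaded as plain `T`s; a quick illustration:

```julia
Ac = [1.0 + 2.0im  3.0 + 4.0im]        # 1×2 Matrix{ComplexF64}
A  = reinterpret(reshape, Float64, Ac) # 2×1×2 array
A[1, 1, 1], A[2, 1, 1]                 # (1.0, 2.0): real and imaginary parts
```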
@@ -121,16 +121,16 @@ function jdot(a, b)
 end
 function jdotavx(a, b)
     s = zero(eltype(a))
-    # @avx for i ∈ eachindex(a,b)
-    @avx for i ∈ eachindex(a)
+    # @turbo for i ∈ eachindex(a,b)
+    @turbo for i ∈ eachindex(a)
         s += a[i] * b[i]
     end
     s
 end
 function jdotavxt(a, b)
     s = zero(eltype(a))
-    # @avx for i ∈ eachindex(a,b)
-    @avxt for i ∈ eachindex(a)
+    # @turbo for i ∈ eachindex(a,b)
+    @tturbo for i ∈ eachindex(a)
         s += a[i] * b[i]
     end
     s
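As the hunk above shows, `@avxt` becomes `@tturbo`, the threaded counterpart of `@turbo`. A minimal standalone sketch contrasting the two (function names are illustrative):

```julia
using LoopVectorization

function dot_turbo(a, b)
    s = zero(eltype(a))
    @turbo for i ∈ eachindex(a)   # single-threaded SIMD
        s += a[i] * b[i]
    end
    s
end

function dot_tturbo(a, b)
    s = zero(eltype(a))
    @tturbo for i ∈ eachindex(a)  # SIMD plus threading
        s += a[i] * b[i]
    end
    s
end
```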
@@ -144,7 +144,7 @@ function jselfdot(a)
 end
 function jselfdotavx(a)
     s = zero(eltype(a))
-    @avx for i ∈ eachindex(a)
+    @turbo for i ∈ eachindex(a)
         s += a[i] * a[i]
     end
     s
@@ -160,7 +160,7 @@ end
 function jdot3v2avx(x, A, y)
     M, N = size(A)
     s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
-    @avx for n ∈ 1:N, m ∈ 1:M
+    @turbo for n ∈ 1:N, m ∈ 1:M
         s += x[m] * A[m,n] * y[n]
     end
     s
@@ -178,7 +178,7 @@ function jdot3(x, A, y)
 end
 function jdot3avx(x, A, y)
     s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
-    @avx for n ∈ axes(A,2)
+    @turbo for n ∈ axes(A,2)
         t = zero(s)
         for m ∈ axes(A,1)
             t += x[m] * A[m,n]
@@ -193,7 +193,7 @@ function jvexp!(b, a)
     end
 end
 function jvexpavx!(b, a)
-    @avx for i ∈ eachindex(a)
+    @turbo for i ∈ eachindex(a)
         b[i] = exp(a[i])
     end
 end
@@ -206,7 +206,7 @@ function jsvexp(a)
 end
 function jsvexpavx(a)
     s = zero(eltype(a))
-    @avx for i ∈ eachindex(a)
+    @turbo for i ∈ eachindex(a)
         s += exp(a[i])
     end
     s
@@ -230,7 +230,7 @@ function jgemv!(𝐲, 𝐀ᵀ::Adjoint, 𝐱)
     end
 end
 function jgemvavx!(𝐲, 𝐀, 𝐱)
-    @avx for i ∈ eachindex(𝐲)
+    @turbo for i ∈ eachindex(𝐲)
         𝐲ᵢ = zero(eltype(𝐲))
         for j ∈ eachindex(𝐱)
             𝐲ᵢ += 𝐀[i,j] * 𝐱[j]
@@ -248,7 +248,7 @@ function jvar!(𝐬², 𝐀, x̄)
     end
 end
 function jvaravx!(𝐬², 𝐀, x̄)
-    @avx for j ∈ eachindex(𝐬²)
+    @turbo for j ∈ eachindex(𝐬²)
         𝐬²ⱼ = zero(eltype(𝐬²))
         x̄ⱼ = x̄[j]
         for i ∈ 1:size(𝐀,2)
@@ -259,7 +259,7 @@ function jvaravx!(𝐬², 𝐀, x̄)
     end
 end
 japlucBc!(D, a, B, c) = @. D = a + B * c';
-japlucBcavx!(D, a, B, c) = @avx @. D = a + B * c';
+japlucBcavx!(D, a, B, c) = @turbo @. D = a + B * c';
 
 function jOLSlp(y, X, β)
     lp = zero(eltype(y))
@@ -274,7 +274,7 @@ function jOLSlp(y, X, β)
 end
 function jOLSlp_avx(y, X, β)
     lp = zero(eltype(y))
-    @avx for i ∈ eachindex(y)
+    @turbo for i ∈ eachindex(y)
         δ = y[i]
         for j ∈ eachindex(β)
             δ -= X[i,j] * β[j]
@@ -300,7 +300,7 @@ function randomaccessavx(P, basis, coeffs::Vector{T}) where {T}
     C = length(coeffs)
     A = size(P, 1)
     p = zero(T)
-    @avx for c ∈ 1:C
+    @turbo for c ∈ 1:C
         pc = coeffs[c]
         for a = 1:A
             pc *= P[a, basis[a, c]]
@@ -319,7 +319,7 @@ end
 function jlogdettriangleavx(B::Union{LowerTriangular,UpperTriangular})
     A = parent(B) # No longer supported
     ld = zero(eltype(A))
-    @avx for n ∈ axes(A,1)
+    @turbo for n ∈ axes(A,1)
         ld += log(A[n,n])
     end
     ld
@@ -339,7 +339,7 @@ function filter2d!(out::AbstractMatrix, A::AbstractMatrix, kern)
     out
 end
 function filter2davx!(out::AbstractMatrix, A::AbstractMatrix, kern)
-    @avx for J in CartesianIndices(out)
+    @turbo for J in CartesianIndices(out)
         tmp = zero(eltype(out))
         for I ∈ CartesianIndices(kern)
             tmp += A[I + J] * kern[I]
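For context, `filter2davx!` requires that every `A[I + J]` access stay in bounds, since `@turbo` performs no bounds checking; a hedged usage sketch with OffsetArrays (the sizes are my own choice):

```julia
using LoopVectorization, OffsetArrays

A    = rand(100, 100)
kern = OffsetArray(rand(3, 3), -1:1, -1:1)  # centered 3×3 kernel
out  = OffsetArray(Matrix{Float64}(undef, 98, 98), 2:99, 2:99)
# Every J ∈ CartesianIndices(out) plus I ∈ CartesianIndices(kern)
# lands inside axes(A) == (1:100, 1:100).
filter2davx!(out, A, kern)
```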
@@ -364,7 +364,7 @@ end
 function filter2dunrolledavx!(out::AbstractMatrix, A::AbstractMatrix, kern::SizedOffsetMatrix{T,-1,1,-1,1}) where {T}
     rng1, rng2 = axes(out)
     Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik -> kern_ik_jk = kern[ik-2,jk-2]
-    @avx for j in rng2, i in rng1
+    @turbo for j in rng2, i in rng1
         tmp_0 = zero(eltype(out))
         Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik -> tmp_{ik+(jk-1)*3} = A[i+(ik-2),j+(jk-2)] * kern_ik_jk + tmp_{ik+(jk-1)*3-1}
         out[i,j] = tmp_9
@@ -379,7 +379,7 @@ end
 #     end
 # end
 # function smooth_line_avx!(sl,nrm1,j,i1,sl,rl,ih2,denom)
-#     @avx for i=i1:2:nrm1
+#     @turbo for i=i1:2:nrm1
 #         sl[i,j]=denom*(rl[i,j]+ih2*(sl[i,j-1]+sl[i-1,j]+sl[i+1,j]+sl[i,j+1]))
 #     end
 # end

docs/src/api.md

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
 ## Macros
 
 ```@docs
-@avx
+@turbo
 @_avx
 ```
 
docs/src/devdocs/constructing_loopsets.md

Lines changed: 4 additions & 4 deletions

@@ -2,9 +2,9 @@
 
 ## Loop expressions
 
-When applying `@avx` to a loop expression, it creates a `LoopSet` without awareness of type information, and then [condenses the information](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/condense_loopset.jl) into a summary which is passed as type information to a generated function.
+When applying `@turbo` to a loop expression, it creates a `LoopSet` without awareness of type information, and then [condenses the information](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/condense_loopset.jl) into a summary which is passed as type information to a generated function.
 ```julia
-julia> @macroexpand @avx for m ∈ 1:M, n ∈ 1:N
+julia> @macroexpand @turbo for m ∈ 1:M, n ∈ 1:N
            C[m,n] = zero(eltype(B))
            for k ∈ 1:K
               C[m,n] += A[m,k] * B[k,n]
@@ -36,7 +36,7 @@ and the set of loop bounds:
 
 ## Broadcasting
 
-When applying the `@avx` macro to a broadcast expression, there are no explicit loops, and even the dimensionality of the operation is unknown. Consequently, the `LoopSet` object must be constructed at compile time. The function, the involved operations, and their relationships are straightforward to infer from the structure of nested broadcasts:
+When applying the `@turbo` macro to a broadcast expression, there are no explicit loops, and even the dimensionality of the operation is unknown. Consequently, the `LoopSet` object must be constructed at compile time. The function, the involved operations, and their relationships are straightforward to infer from the structure of nested broadcasts:
 ```julia
 julia> Meta.@lower @. f(g(a,b) + c) / d
 :($(Expr(:thunk, CodeInfo(
@@ -49,7 +49,7 @@ julia> Meta.@lower @. f(g(a,b) + c) / d
 └── return %5
 ))))
 
-julia> @macroexpand @avx @. f(g(a,b) + c) / d
+julia> @macroexpand @turbo @. f(g(a,b) + c) / d
 quote
     var"##262" = Base.broadcasted(g, a, b)
     var"##263" = Base.broadcasted(+, var"##262", c)

docs/src/devdocs/reference.md

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ LoopVectorization.ArrayReferenceMeta
 
 ## Condensed types
 
-These are used when encoding the `@avx` block as a type parameter for passing through
+These are used when encoding the `@turbo` block as a type parameter for passing through
 to the `@generated` function.
 
 ```@docs

docs/src/examples/array_interface.md

Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@ By supporting the interface, using `LoopVectorization` can simplify implementing
 using StaticArrays, LoopVectorization
 
 @inline function AmulB!(C, A, B)
-    @avx for n ∈ axes(C,2), m ∈ axes(C,1)
+    @turbo for n ∈ axes(C,2), m ∈ axes(C,1)
         Cmn = zero(eltype(C))
         for k ∈ axes(B,1)
             Cmn += A[m,k] * B[k,n]
@@ -93,7 +93,7 @@ C_hybrid = HybridArray{Tuple{StaticArrays.Dynamic(),StaticArrays.Dynamic(),3,3}}
 # A is M x K x I x L
 # B is K x N x L x J
 function bmul!(C, A, B)
-    @avx for n in axes(C,2), m in axes(C,1), j in axes(C,4), i in axes(C,3)
+    @turbo for n in axes(C,2), m in axes(C,1), j in axes(C,4), i in axes(C,3)
         Cmnji = zero(eltype(C))
         for k in axes(B,1), l in axes(B,3)
             Cmnji += A[m,k,i,l] * B[k,n,l,j]
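A hedged usage sketch of the `AmulB!` kernel above with mutable static matrices (assuming `MMatrix` works through the array interface, as this page describes):

```julia
using StaticArrays, LoopVectorization, LinearAlgebra

A = @MMatrix rand(7, 7)
B = @MMatrix rand(7, 7)
C = MMatrix{7,7,Float64}(undef)
AmulB!(C, A, B)
C ≈ A * B  # true
```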
