Skip to content

Commit 1f5abea

Browse files
authored
Merge pull request #157 from chriselrod/support1.6
WIP: Minimal Changes for 1.6 support
2 parents 84de1a6 + c8ebb24 commit 1f5abea

40 files changed

+1291
-837
lines changed

.github/workflows/ci.yml

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
name: CI
2+
on:
3+
pull_request:
4+
branches:
5+
- master
6+
push:
7+
branches:
8+
- master
9+
tags: '*'
10+
jobs:
11+
test:
12+
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
13+
runs-on: ${{ matrix.os }}
14+
strategy:
15+
fail-fast: false
16+
matrix:
17+
version:
18+
- '1.4'
19+
- '1'
20+
- 'nightly'
21+
os:
22+
- ubuntu-latest
23+
arch:
24+
- x64
25+
steps:
26+
- uses: actions/checkout@v2
27+
- uses: julia-actions/setup-julia@v1
28+
with:
29+
version: ${{ matrix.version }}
30+
arch: ${{ matrix.arch }}
31+
- uses: actions/cache@v1
32+
env:
33+
cache-name: cache-artifacts
34+
with:
35+
path: ~/.julia/artifacts
36+
key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
37+
restore-keys: |
38+
${{ runner.os }}-test-${{ env.cache-name }}-
39+
${{ runner.os }}-test-
40+
${{ runner.os }}-
41+
- uses: julia-actions/julia-buildpkg@v1
42+
- uses: julia-actions/julia-runtest@v1
43+
- uses: julia-actions/julia-processcoverage@v1
44+
- uses: codecov/codecov-action@v1
45+
with:
46+
file: lcov.info
47+
docs:
48+
name: Documentation
49+
runs-on: ubuntu-latest
50+
steps:
51+
- uses: actions/checkout@v2
52+
- uses: julia-actions/setup-julia@v1
53+
with:
54+
version: '1'
55+
- run: |
56+
julia --project=docs -e '
57+
using Pkg
58+
Pkg.develop(PackageSpec(path=pwd()))
59+
Pkg.instantiate()'
60+
- run: |
61+
julia --project=docs -e '
62+
using Documenter: doctest
63+
using LoopVectorization
64+
doctest(LoopVectorization)'
65+
- run: julia --project=docs docs/make.jl
66+
env:
67+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
68+
DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}

.travis.yml

Lines changed: 0 additions & 31 deletions
This file was deleted.

Project.toml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,27 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.8.26"
4+
version = "0.9.0"
55

66
[deps]
7+
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
78
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
9+
IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
810
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
911
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
10-
SIMDPirates = "21efa798-c60a-11e8-04d3-e1a92915a26a"
1112
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
1213
UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
1314
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1415

1516
[compat]
17+
ArrayInterface = "2.14"
1618
DocStringExtensions = "0.8"
19+
IfElse = "0"
1720
OffsetArrays = "1"
18-
SIMDPirates = "0.8.25"
19-
SLEEFPirates = "0.5.4"
21+
SLEEFPirates = "0.6"
2022
UnPack = "0,1"
21-
VectorizationBase = "0.12.31"
22-
julia = "1.1"
23+
VectorizationBase = "0.13"
24+
julia = "1.4"
2325

2426
[extras]
2527
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

benchmark/looptests.jl

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,24 @@
1-
using LoopVectorization, LinearAlgebra, OffsetArrays
1+
using LoopVectorization, LinearAlgebra, OffsetArrays, ArrayInterface
22
BLAS.set_num_threads(1)
33

4-
using LoopVectorization.VectorizationBase: StaticUnitRange
5-
struct SizedOffsetMatrix{T,LR,UR,LC,RC} <: DenseMatrix{T}
4+
# TODO: remove this once this PR merges: https://github.com/JuliaArrays/OffsetArrays.jl/pull/170
5+
@inline Base.unsafe_convert(::Type{Ptr{T}}, A::OffsetArray{T}) where {T} = pointer(parent(A))
6+
7+
struct SizedOffsetMatrix{T,LR,UR,LC,UC} <: DenseMatrix{T}
68
data::Matrix{T}
79
end
8-
Base.axes(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (StaticUnitRange{LR,UR}(),StaticUnitRange{LC,UC}())
10+
Base.size(::SizedOffsetMatrix{<:Any,LR,UR,LC,UC}) where {LR,UR,LC,UC} = (UR-LR+1,UC-LC+1)
11+
Base.axes(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (Static{LR}():Static{UR}(),Static{LC}():Static{UC}())
912
Base.parent(A::SizedOffsetMatrix) = A.data
10-
@generated function LoopVectorization.stridedpointer(A::SizedOffsetMatrix{T,LR,UR,LC,RC}) where {T,LR,UR,LC,RC}
11-
quote
12-
$(Expr(:meta,:inline))
13-
LoopVectorization.OffsetStridedPointer(
14-
LoopVectorization.StaticStridedPointer{$T,Tuple{1,$(UR-LR+1)}}(pointer(parent(A))),
15-
($(LR-1), $(LC-1))
16-
)
17-
end
18-
end
19-
Base.getindex(A::SizedOffsetMatrix, i, j) = LoopVectorization.vload(LoopVectorization.stridedpointer(A), (i-1,j-1))
20-
Base.axes(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (StaticUnitRange{LR,UR}(),StaticUnitRange{LC,UC}())
21-
Base.size(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (1 + UR-LR, 1 + UC-LC)
22-
Base.unsafe_convert(::Type{Ptr{Float64}}, A::SizedOffsetMatrix) = Base.unsafe_convert(Ptr{Float64}, A.data)
13+
Base.unsafe_convert(::Type{Ptr{T}}, A::SizedOffsetMatrix{T}) where {T} = pointer(A.data)
14+
ArrayInterface.contiguous_axis(::Type{<:SizedOffsetMatrix}) = ArrayInterface.Contiguous{1}()
15+
ArrayInterface.contiguous_batch_size(::Type{<:SizedOffsetMatrix}) = ArrayInterface.ContiguousBatch{0}()
16+
ArrayInterface.stride_rank(::Type{<:SizedOffsetMatrix}) = ArrayInterface.StrideRank{(1,2)}()
17+
function ArrayInterface.strides(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC}
18+
(Static{1}(), (Static{UR}() - Static{LR}() + Static{1}()))
19+
end
20+
ArrayInterface.offsets(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (Static{LR}(), Static{LC}())
21+
Base.getindex(A::SizedOffsetMatrix, i, j) = LoopVectorization.vload(LoopVectorization.stridedpointer(A), (i,j))
2322

2423

2524
function jgemm!(𝐂, 𝐀, 𝐁)

docs/src/devdocs/loopset_structure.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ julia> LoopVectorization.operations(lsAmulB)
2626
var"##reduction#260" = LoopVectorization.vfmadd_fast(var"##tempload#258", var"##tempload#259", var"##reduction#260")
2727
var"##RHS#256" = LoopVectorization.reduce_to_add(var"##reduction#260", var"##RHS#256")
2828
```
29-
The act of performing a "reduction" across a loop introduces a few extra operations that manage creating a "zero" with respect to the reduction, and then combining with the specified value using `reduce_to_add`, which performs any necessary type conversions, such as from an `SVec` vector-type to a scalar, if necessary. This simplifies code generation, by making the functions agnostic with respect to the actual vectorization decisions the library makes.
29+
The act of performing a "reduction" across a loop introduces a few extra operations that manage creating a "zero" with respect to the reduction, and then combining with the specified value using `reduce_to_add`, which performs any necessary type conversions, such as from a `Vec` vector-type to a scalar, if necessary. This simplifies code generation, by making the functions agnostic with respect to the actual vectorization decisions the library makes.
3030

3131
Each operation is listed as depending on a set of loop iteration symbols:
3232
```julia

docs/src/devdocs/lowering.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,6 @@ This task is made simpler via multiple dispatch making the lowering of the compo
55
```julia
66
vload(vptr_A, (i,j,k))
77
```
8-
with the behavior of this load determined by the types of the arguments. Vectorization is expressed by making an index a `_MM{W}` type, rather than an integer, and operations with it will either produce another `_MM{W}` when it will still correspond to contiguous loads, or an `SVec{W,<:Integer}` if the resulting loads will be discontiguous, so that a `gather` or `scatter!` will be used. If all indexes are simply integers, then this produces a scalar load or store.
8+
with the behavior of this load determined by the types of the arguments. Vectorization is expressed by making an index a `_MM{W}` type, rather than an integer, and operations with it will either produce another `_MM{W}` when it will still correspond to contiguous loads, or a `Vec{W,<:Integer}` if the resulting loads will be discontiguous, so that a `gather` or `scatter!` will be used. If all indexes are simply integers, then this produces a scalar load or store.
99

1010

src/LoopVectorization.jl

Lines changed: 36 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,28 @@
11
module LoopVectorization
22

3-
if (!isnothing(get(ENV, "TRAVIS_BRANCH", nothing)) || !isnothing(get(ENV, "APPVEYOR", nothing))) && isdefined(Base, :Experimental) && isdefined(Base.Experimental, Symbol("@optlevel"))
4-
@eval Base.Experimental.@optlevel 1
5-
end
6-
7-
using VectorizationBase, SIMDPirates, SLEEFPirates, UnPack, OffsetArrays
8-
using VectorizationBase: REGISTER_SIZE, extract_data, num_vector_load_expr,
9-
mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valmulsub, valadd, valsub, _MM,
10-
maybestaticlength, maybestaticsize, staticm1, staticp1, staticmul, subsetview, vzero, stridedpointer_for_broadcast,
11-
Static, Zero, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange, unwrap, maybestaticrange,
12-
AbstractColumnMajorStridedPointer, AbstractRowMajorStridedPointer, AbstractSparseStridedPointer, AbstractStaticStridedPointer,
13-
PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct, offsetprecalc,
14-
maybestaticfirst, maybestaticlast, scalar_less, scalar_greater, noalias!, gesp, gepbyte, pointerforcomparison, NativeTypes, staticmul, staticmuladd
15-
using SIMDPirates: VECTOR_SYMBOLS, evadd, evsub, evmul, evfdiv, vrange,
16-
reduced_add, reduced_prod, reduce_to_add, reduced_max, reduced_min, vsum, vprod, vmaximum, vminimum,
17-
sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
18-
vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, sizeequivalentfloat, sizeequivalentint, #prefetch,
19-
vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone, vadd1, relu
3+
# if (!isnothing(get(ENV, "TRAVIS_BRANCH", nothing)) || !isnothing(get(ENV, "APPVEYOR", nothing))) && isdefined(Base, :Experimental) && isdefined(Base.Experimental, Symbol("@optlevel"))
4+
# @eval Base.Experimental.@optlevel 1
5+
# end
6+
7+
using VectorizationBase, SLEEFPirates, UnPack, OffsetArrays
8+
using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, data,
9+
mask, pick_vector_width_val, MM,
10+
maybestaticlength, maybestaticsize, staticm1, staticp1, staticmul, vzero,
11+
Zero, maybestaticrange, offsetprecalc, lazymul,
12+
maybestaticfirst, maybestaticlast, scalar_less, gep, gesp, pointerforcomparison, NativeTypes,
13+
vfmadd, vfmsub, vfnmadd, vfnmsub, vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, vadd, vsub, vmul,
14+
relu, stridedpointer, StridedPointer, AbstractStridedPointer,
15+
reduced_add, reduced_prod, reduce_to_add, reduce_to_prod, reduced_max, reduced_min, reduce_to_max, reduce_to_min,
16+
vsum, vprod, vmaximum, vminimum, vstorent!
17+
18+
using IfElse: ifelse
19+
20+
# missing: stridedpointer_for_broadcast, noalias!, gepbyte,
21+
# using SIMDPirates: VECTOR_SYMBOLS, evadd, evsub, evmul, evfdiv, vrange,
22+
# reduced_add, reduced_prod, reduce_to_add, reduced_max, reduced_min, vsum, vprod, vmaximum, vminimum,
23+
# sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
24+
# vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, sizeequivalentfloat, sizeequivalentint, #prefetch,
25+
# vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone, vadd1, relu
2026
using SLEEFPirates: pow
2127
using Base.Broadcast: Broadcasted, DefaultArrayStyle
2228
using LinearAlgebra: Adjoint, Transpose
@@ -26,27 +32,28 @@ import LinearAlgebra # for check_args
2632

2733
using Base.FastMath: add_fast, sub_fast, mul_fast, div_fast
2834

35+
using ArrayInterface
36+
using ArrayInterface: OptionallyStaticUnitRange, Zero, One
37+
const Static = ArrayInterface.StaticInt
38+
39+
40+
2941
export LowDimArray, stridedpointer,
3042
@avx, @_avx, *ˡ, _avx_!,
3143
vmap, vmap!, vmapt, vmapt!, vmapnt, vmapnt!, vmapntt, vmapntt!,
3244
vfilter, vfilter!, vmapreduce, vreduce
3345

46+
@inline unwrap(::Val{N}) where {N} = N
47+
@inline unwrap(::Static{N}) where {N} = N
48+
@inline unwrap(x) = x
49+
3450
const VECTORWIDTHSYMBOL, ELTYPESYMBOL = Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##")
3551

36-
"""
37-
REGISTER_COUNT defined in VectorizationBase is supposed to correspond to the actual number of floating point registers on the system.
38-
It is hardcoded into a file at build time.
39-
However, someone may have multiple builds of Julia on the same system, some 32-bit and some 64-bit (e.g., they use 64-bit primarilly,
40-
but keep a 32-bit build on hand to debug test failures on Appveyor's 32-bit build). Thus, we don't want REGISTER_COUNT to be hardcoded
41-
in such a fashion.
42-
32-bit builds are limited to only 8 floating point registers, so we take care of that here.
43-
44-
If you want good performance, DO NOT use a 32-bit build of Julia if you don't have to.
45-
"""
46-
const REGISTER_COUNT = Sys.ARCH === :i686 ? 8 : VectorizationBase.REGISTER_COUNT
4752

53+
include("vectorizationbase_compat/contract_pass.jl")
54+
include("vectorizationbase_compat/subsetview.jl")
4855
include("getconstindexes.jl")
49-
include("vectorizationbase_extensions.jl")
56+
# include("vectorizationbase_extensions.jl")
5057
include("predicates.jl")
5158
include("map.jl")
5259
include("filter.jl")

src/add_compute.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -328,9 +328,9 @@ function add_pow!(
328328
return add_compute!(ls, var, :^, [xop, pop], elementbytes)
329329
end
330330
if pint == -1
331-
return add_compute!(ls, var, :vinv, [xop], elementbytes)
331+
return add_compute!(ls, var, :inv, [xop], elementbytes)
332332
elseif pint < 0
333-
xop = add_compute!(ls, gensym(:inverse), :vinv, [xop], elementbytes)
333+
xop = add_compute!(ls, gensym(:inverse), :inv, [xop], elementbytes)
334334
pint = - pint
335335
end
336336
if pint == 0
@@ -340,22 +340,22 @@ function add_pow!(
340340
elseif pint == 1
341341
return add_compute!(ls, var, :identity, [xop], elementbytes)
342342
elseif pint == 2
343-
return add_compute!(ls, var, :vabs2, [xop], elementbytes)
343+
return add_compute!(ls, var, :abs2, [xop], elementbytes)
344344
end
345345

346346
# Implementation from https://github.com/JuliaLang/julia/blob/a965580ba7fd0e8314001521df254e30d686afbf/base/intfuncs.jl#L216
347347
t = trailing_zeros(pint) + 1
348348
pint >>= t
349349
while (t -= 1) > 0
350350
varname = (iszero(pint) && isone(t)) ? var : gensym(:pbs)
351-
xop = add_compute!(ls, varname, :vabs2, [xop], elementbytes)
351+
xop = add_compute!(ls, varname, :abs2, [xop], elementbytes)
352352
end
353353
yop = xop
354354
while pint > 0
355355
t = trailing_zeros(pint) + 1
356356
pint >>= t
357357
while (t -= 1) >= 0
358-
xop = add_compute!(ls, gensym(:pbs), :vabs2, [xop], elementbytes)
358+
xop = add_compute!(ls, gensym(:pbs), :abs2, [xop], elementbytes)
359359
end
360360
yop = add_compute!(ls, iszero(pint) ? var : gensym(:pbs), :vmul, [xop, yop], elementbytes)
361361
end

src/add_ifelse.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,13 @@ function add_if!(ls::LoopSet, LHS::Symbol, RHS::Expr, elementbytes::Int, positio
3535
else
3636
falseop = getop(ls, iffalse, elementbytes)
3737
end
38-
add_compute!(ls, LHS, :vifelse, [condop, trueop, falseop], elementbytes)
38+
add_compute!(ls, LHS, :ifelse, [condop, trueop, falseop], elementbytes)
3939
end
4040

4141
function add_andblock!(ls::LoopSet, condop::Operation, LHS, rhsop::Operation, elementbytes::Int, position::Int)
4242
if LHS isa Symbol
4343
altop = getop(ls, LHS, elementbytes)
44-
return add_compute!(ls, LHS, :vifelse, [condop, rhsop, altop], elementbytes)
44+
return add_compute!(ls, LHS, :ifelse, [condop, rhsop, altop], elementbytes)
4545
elseif LHS isa Expr && LHS.head === :ref
4646
return add_conditional_store!(ls, LHS, condop, rhsop, elementbytes)
4747
else
@@ -78,10 +78,10 @@ function add_orblock!(ls::LoopSet, condop::Operation, LHS, rhsop::Operation, ele
7878
negatedcondop = negateop!(ls, condop, elementbytes)
7979
if LHS isa Symbol
8080
altop = getop(ls, LHS, elementbytes)
81-
# return add_compute!(ls, LHS, :vifelse, [condop, altop, rhsop], elementbytes)
81+
# return add_compute!(ls, LHS, :ifelse, [condop, altop, rhsop], elementbytes)
8282
# Placing altop second seems to let LLVM fuse operations; but as of LLVM 9.0.1 it will not if altop is first
8383
# therefore, we negate the condition and switch order so that the altop is second.
84-
return add_compute!(ls, LHS, :vifelse, [negatedcondop, rhsop, altop], elementbytes)
84+
return add_compute!(ls, LHS, :ifelse, [negatedcondop, rhsop, altop], elementbytes)
8585
elseif LHS isa Expr && LHS.head === :ref
8686
# negatedcondop = add_compute!(ls, gensym(:negated_mask), :~, [condop], elementbytes)
8787
return add_conditional_store!(ls, LHS, negatedcondop, rhsop, elementbytes)

src/add_stores.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ function add_conditional_store!(ls::LoopSet, LHS, condop::Operation, storeop::Op
113113
# prevstore = getop(ls, id + 1)
114114
# # @show prevstore prevstore.node_type, loopdependencies(prevstore)
115115
# # @show operations(ls)
116-
# storeop = add_compute!(ls, gensym(:combinedstoreop), Instruction(:vifelse), [condop, storeop, first(parents(prevstore))], elementbytes)
116+
# storeop = add_compute!(ls, gensym(:combinedstoreop), Instruction(:ifelse), [condop, storeop, first(parents(prevstore))], elementbytes)
117117
# storeparents = [storeop]
118118
# storeinstr = if prevstore.instruction.instr === :conditionalstore!
119119
# push!(storeparents, add_compute!(ls, gensym(:combinedmask), Instruction(:|), [condop, last(parents(prevstore))], elementbytes))

0 commit comments

Comments
 (0)