Commit c070363
Update URLs to JuliaSIMD. Upgrade to ThreadingUtilities 0.4. Add threading tests. Fix bug that occurred when there were two unrolled loops and an outer reduction that was directly unrolled by one and depended on an operation unrolled by the other.
1 parent 49d5ae9 commit c070363
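The fix targets loop nests like the following (a hypothetical sketch, not code from this commit or its new tests): an outer reduction that `@avx` may unroll along one loop while the operation feeding it is unrolled along the other.

```julia
using LoopVectorization

# Hypothetical reproducer sketch (assumed shape; the commit's actual tests may differ).
# `s` is an outer reduction carried across both loops. When `@avx` unrolls both `m`
# and `n`, `s` can be unrolled by one factor while `A[m, n] * x[n]` is unrolled by
# the other, which previously produced mismatched `VecUnroll` lengths.
function sum_of_products(A, x)
    s = zero(eltype(A))
    @avx for m ∈ axes(A, 1), n ∈ axes(A, 2)
        s += A[m, n] * x[n]
    end
    s
end
```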

File tree

15 files changed: +280 −121 lines

Project.toml

Lines changed: 3 additions & 3 deletions

```diff
@@ -19,16 +19,16 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [compat]
 ArrayInterface = "3"
-CheapThreads = "0.1.1"
+CheapThreads = "0.1.2"
 DocStringExtensions = "0.8"
 IfElse = "0.1"
 OffsetArrays = "1.4.1, 1.5"
 Requires = "1"
 SLEEFPirates = "0.6.12"
 Static = "0.2"
-ThreadingUtilities = "0.3"
+ThreadingUtilities = "0.4"
 UnPack = "1"
-VectorizationBase = "0.19.6"
+VectorizationBase = "0.19.8"
 julia = "1.5"
 
 [extras]
```

README.md

Lines changed: 7 additions & 7 deletions

```diff
@@ -1,10 +1,10 @@
 # LoopVectorization
 
-[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://chriselrod.github.io/LoopVectorization.jl/stable)
-[![Latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://chriselrod.github.io/LoopVectorization.jl/latest)
-[![CI](https://github.com/chriselrod/LoopVectorization.jl/workflows/CI/badge.svg)](https://github.com/chriselrod/LoopVectorization.jl/actions?query=workflow%3ACI)
-[![CI (Julia nightly)](https://github.com/chriselrod/LoopVectorization.jl/workflows/CI%20(Julia%20nightly)/badge.svg)](https://github.com/chriselrod/LoopVectorization.jl/actions?query=workflow%3A%22CI+%28Julia+nightly%29%22)
-[![Codecov](https://codecov.io/gh/chriselrod/LoopVectorization.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/chriselrod/LoopVectorization.jl)
+[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://JuliaSIMD.github.io/LoopVectorization.jl/stable)
+[![Latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://JuliaSIMD.github.io/LoopVectorization.jl/latest)
+[![CI](https://github.com/JuliaSIMD/LoopVectorization.jl/workflows/CI/badge.svg)](https://github.com/JuliaSIMD/LoopVectorization.jl/actions?query=workflow%3ACI)
+[![CI (Julia nightly)](https://github.com/JuliaSIMD/LoopVectorization.jl/workflows/CI%20(Julia%20nightly)/badge.svg)](https://github.com/JuliaSIMD/LoopVectorization.jl/actions?query=workflow%3A%22CI+%28Julia+nightly%29%22)
+[![Codecov](https://codecov.io/gh/JuliaSIMD/LoopVectorization.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaSIMD/LoopVectorization.jl)
 
 ## Installation
 
@@ -39,7 +39,7 @@ Please see the documentation for benchmarks versus base Julia, Clang, icc, ifort
 
 LLVM/Julia by default generate essentially optimal code for a primary vectorized part of this loop. In many cases -- such as the dot product -- this vectorized part of the loop computes 4*SIMD-vector-width iterations at a time.
 On the CPU I'm running these benchmarks on with `Float64` data, the SIMD-vector-width is 8, meaning it will compute 32 iterations at a time.
-However, LLVM is very slow at handling the tails, `length(iterations) % 32`. For this reason, [in benchmark plots](https://chriselrod.github.io/LoopVectorization.jl/latest/examples/dot_product/) you can see performance drop as the size of the remainder increases.
+However, LLVM is very slow at handling the tails, `length(iterations) % 32`. For this reason, [in benchmark plots](https://JuliaSIMD.github.io/LoopVectorization.jl/latest/examples/dot_product/) you can see performance drop as the size of the remainder increases.
 
 For simple loops like a dot product, LoopVectorization.jl's most important optimization is to handle these tails more efficiently:
 <details>
@@ -346,7 +346,7 @@ Similar approaches can be taken to make kernels working with a variety of numeri
 * [Gaius.jl](https://github.com/MasonProtter/Gaius.jl)
 * [MaBLAS.jl](https://github.com/YingboMa/MaBLAS.jl)
 * [Octavian.jl](https://github.com/JuliaLinearAlgebra/Octavian.jl)
-* [PaddedMatrices.jl](https://github.com/chriselrod/PaddedMatrices.jl)
+* [PaddedMatrices.jl](https://github.com/JuliaSIMD/PaddedMatrices.jl)
 * [RecursiveFactorization.jl](https://github.com/YingboMa/RecursiveFactorization.jl)
 * [SnpArrays.jl](https://github.com/OpenMendel/SnpArrays.jl)
 * [Tullio.jl](https://github.com/mcabbott/Tullio.jl)
```
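For reference, the dot-product kernel discussed in the README's tail-handling paragraph looks roughly like this (a sketch in the README's style; `mydot` is not a name taken from it):

```julia
using LoopVectorization

# Sketch of a dot-product kernel like those the README benchmarks. For the
# `length(a) % 32` tail, `@avx` emits masked SIMD operations rather than
# falling back to a slow scalar remainder loop, which is the optimization
# the paragraph above describes.
function mydot(a, b)
    s = zero(eltype(a))
    @avx for i ∈ eachindex(a, b)
        s += a[i] * b[i]
    end
    s
end
```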

docs/make.jl

Lines changed: 2 additions & 2 deletions

```diff
@@ -27,12 +27,12 @@ makedocs(;
             "devdocs/reference.md"
         ]
     ],
-    # repo="https://github.com/chriselrod/LoopVectorization.jl/blob/{commit}{path}#L{line}",
+    # repo="https://github.com/JuliaSIMD/LoopVectorization.jl/blob/{commit}{path}#L{line}",
     sitename="LoopVectorization.jl",
     authors="Chris Elrod"
     # assets=[],
 )
 
 deploydocs(;
-    repo="github.com/chriselrod/LoopVectorization.jl",
+    repo="github.com/JuliaSIMD/LoopVectorization.jl",
 )
```

docs/src/devdocs/constructing_loopsets.md

Lines changed: 3 additions & 3 deletions

````diff
@@ -2,7 +2,7 @@
 
 ## Loop expressions
 
-When applying `@avx` to a loop expression, it creates a `LoopSet` without awareness of type information, and then [condenses the information](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/condense_loopset.jl) into a summary which is passed as type information to a generated function.
+When applying `@avx` to a loop expression, it creates a `LoopSet` without awareness of type information, and then [condenses the information](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/condense_loopset.jl) into a summary which is passed as type information to a generated function.
 ```julia
 julia> @macroexpand @avx for m ∈ 1:M, n ∈ 1:N
         C[m,n] = zero(eltype(B))
@@ -19,7 +19,7 @@ quote
 end
 end
 ```
-When the corresponding method gets compiled for specific types of `A`, `B`, and `C`, the call to the `@generated` function `_avx_!` gets compiled. This causes the summary to be [reconstructed](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/reconstruct_loopset.jl) using the available type information. This type information can be used, for example, to realize an array has been transposed, and thus correctly identify which axis contains contiguous elements that are efficient to load from. This kind of information cannot be extracted from the raw expression, which is why these decisions are made when the method gets compiled for specific types via the `@generated` function `_avx_!`.
+When the corresponding method gets compiled for specific types of `A`, `B`, and `C`, the call to the `@generated` function `_avx_!` gets compiled. This causes the summary to be [reconstructed](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/reconstruct_loopset.jl) using the available type information. This type information can be used, for example, to realize an array has been transposed, and thus correctly identify which axis contains contiguous elements that are efficient to load from. This kind of information cannot be extracted from the raw expression, which is why these decisions are made when the method gets compiled for specific types via the `@generated` function `_avx_!`.
 
 The three chief components of the summaries are the definitions of operations, e.g.:
 ```julia
@@ -58,4 +58,4 @@ quote
 var"##266" = LoopVectorization.vmaterialize(var"##265", Val{:Main}())
 end
 ```
-These nested broadcasted objects already express information very similar to what the `LoopSet` objects hold. The dimensionality of the objects provides the information on the associated loop dependencies, but again this information is available only when the method is compiled for specific types. The `@generated` function `vmaterialize` constructs the LoopSet by recursively evaluating [add_broadcast!](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/broadcast.jl#L166) on all the fields.
+These nested broadcasted objects already express information very similar to what the `LoopSet` objects hold. The dimensionality of the objects provides the information on the associated loop dependencies, but again this information is available only when the method is compiled for specific types. The `@generated` function `vmaterialize` constructs the LoopSet by recursively evaluating [add_broadcast!](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/broadcast.jl#L166) on all the fields.
````
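As a usage-level illustration of the broadcast path this file documents (a hedged sketch; the `var"##..."` names in the expansion differ per session):

```julia
using LoopVectorization

# Applying `@avx` to a broadcast expression routes materialization through the
# `@generated` function `vmaterialize`, which rebuilds the `LoopSet` from the
# nested `Broadcasted` object's types at compile time.
a = rand(100); b = rand(100);
c = @avx a .+ exp.(b)  # lowered to a vmaterialize call, per the docs above
```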

docs/src/devdocs/evaluating_loops.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 # Determining the strategy for evaluating loops
 
-The heart of the optimizations performed by LoopVectorization is given in the [determinestrategy.jl](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/determinestrategy.jl) file, utilizing instruction costs specified in [costs.jl](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/costs.jl).
+The heart of the optimizations performed by LoopVectorization is given in the [determinestrategy.jl](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/determinestrategy.jl) file, utilizing instruction costs specified in [costs.jl](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/costs.jl).
 Essentially, it estimates the cost of different means of evaluating the loops. It iterates through the different possible loop orders, as well as considering which loops to unroll, and which to vectorize. It will consider unrolling 1 or 2 loops (but it could settle on unrolling by a factor of 1, i.e. not unrolling), and vectorizing 1.
 
 The cost estimate is based on the costs of individual instructions and the number of times each one needs to be executed for the given strategy. The instruction cost can be broken into several components:
```
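A toy illustration of that search, not LoopVectorization's actual API or cost model: enumerate loop orders and unroll factors, score each with a supplied cost estimate, and keep the cheapest.

```julia
using Combinatorics: permutations  # real package, used only for this toy sketch

# Hypothetical cost function (not LoopVectorization internals): favors orders
# with :n innermost, and amortizes loop overhead across larger unroll factors.
estimate_cost(order, unroll) = (last(order) === :n ? 1.0 : 2.0) * (4 + 1 / unroll)

# Exhaustive toy search over loop orders and a few unroll factors.
function choose_strategy(loops, estimate_cost)
    best = (order = loops, unroll = 1, cost = Inf)
    for order in permutations(loops), unroll in (1, 2, 4, 8)
        c = estimate_cost(order, unroll)
        c < best.cost && (best = (order = order, unroll = unroll, cost = c))
    end
    best
end

choose_strategy([:m, :n, :k], estimate_cost)  # picks :n innermost, unroll 8
```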

docs/src/devdocs/overview.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -2,7 +2,7 @@
 
 Here I will try to explain how the library works for the curious or any would-be contributors.
 
-The library uses a [LoopSet](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/graphs.jl#L146) object to model loops. The key components of the library can be divided into:
+The library uses a [LoopSet](https://github.com/JuliaSIMD/LoopVectorization.jl/blob/master/src/graphs.jl#L146) object to model loops. The key components of the library can be divided into:
 1. Defining the LoopSet objects.
 2. Constructing the LoopSet objects.
 3. Determining the strategy of how to evaluate loops.
```

docs/src/examples/matrix_multiplication.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -19,7 +19,7 @@ and this can handle all transposed/not-transposed permutations. LoopVectorization
 Letting all three matrices be square and `Size` x `Size`, we attain the following benchmark results:
 
 ![AmulB](../assets/bench_AmulB_v2.png)
-This is classic GEMM, `𝐂 = 𝐀 * 𝐁`. GFortran's intrinsic `matmul` function does fairly well. But all the compilers are well behind LoopVectorization here, which falls behind MKL's `gemm` beyond 70x70 or so. The problem imposed by alignment is also striking: performance is much higher when the sizes are integer multiples of 8. Padding arrays so that each column is aligned regardless of the number of rows can thus be very profitable. [PaddedMatrices.jl](https://github.com/chriselrod/PaddedMatrices.jl) offers just such arrays in Julia. I believe that is also what the [-pad](https://software.intel.com/en-us/fortran-compiler-developer-guide-and-reference-pad-qpad) compiler flag does when using Intel's compilers.
+This is classic GEMM, `𝐂 = 𝐀 * 𝐁`. GFortran's intrinsic `matmul` function does fairly well. But all the compilers are well behind LoopVectorization here, which falls behind MKL's `gemm` beyond 70x70 or so. The problem imposed by alignment is also striking: performance is much higher when the sizes are integer multiples of 8. Padding arrays so that each column is aligned regardless of the number of rows can thus be very profitable. [PaddedMatrices.jl](https://github.com/JuliaSIMD/PaddedMatrices.jl) offers just such arrays in Julia. I believe that is also what the [-pad](https://software.intel.com/en-us/fortran-compiler-developer-guide-and-reference-pad-qpad) compiler flag does when using Intel's compilers.
 
 ![AmulBt](../assets/bench_AmulBt_v2.png)
 The optimal pattern for `𝐂 = 𝐀 * 𝐁ᵀ` is almost identical to that for `𝐂 = 𝐀 * 𝐁`. Yet, gfortran's `matmul` intrinsic stumbles, surprisingly doing much worse than gfortran + loops, and almost certainly worse than allocating memory for `𝐁ᵀ` and creating the explicit copy.
```
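The kernel behind these GEMM benchmarks looks roughly like the following (a sketch in the docs' style; the benchmark suite's exact code may differ):

```julia
using LoopVectorization

# Triple-loop GEMM; `@avx` picks the loop order, unroll factors, and
# vectorized loop from its cost model instead of leaving that to LLVM.
function A_mul_B!(C, A, B)
    @avx for m ∈ axes(A, 1), n ∈ axes(B, 2)
        Cmn = zero(eltype(C))
        for k ∈ axes(A, 2)
            Cmn += A[m, k] * B[k, n]
        end
        C[m, n] = Cmn
    end
    C
end
```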

src/codegen/lower_compute.jl

Lines changed: 63 additions & 18 deletions

```diff
@@ -183,12 +183,27 @@ end
 1+1
   ifelselastexpr(false, M, (V1,V2), 2, S, true)
 end
+@generated function subset_vec_unroll(vu::VecUnroll{N}, ::StaticInt{S}) where {N,S}
+  (1 ≤ S ≤ N + 1) || throw(ArgumentError("`vu` isa `VecUnroll` of `$(N+1)` elements, but trying to subset $S of them."))
+  t = Expr(:tuple)
+  gf = GlobalRef(Core,:getfield)
+  S == 1 && return Expr(:block, Expr(:meta,:inline), :($gf($gf(vu,1),1,false)))
+  for s ∈ 1:S
+    push!(t.args, Expr(:call, gf, :vud, s, false))
+  end
+  quote
+    $(Expr(:meta,:inline))
+    vud = $gf(vu, 1)
+    VecUnroll($t)
+  end
+end
 # `S` is the ind to replace with the return value of previous invocation ("S" for "self") if reducing
 @generated function partialmap(f::F, default::D, ::StaticInt{M}, ::StaticInt{S}, vargs::Vararg{Any,K}) where {F,M,K,D,S}
   lengths = Vector{Int}(undef, K);
   q = Expr(:block, Expr(:meta,:inline))
   syms = Vector{Symbol}(undef, K)
   isnotpartial = true
+  gf = GlobalRef(Core, :getfield)
   for k ∈ 1:K
     l = vecunrolllen(vargs[k])
     # if l
@@ -197,27 +212,27 @@ end
     lengths[k] = l
     @assert (l == -1) || (l ≥ M)
     syms[k] = symk = Symbol(:vargs_,k)
-    extractq = :(getfield(vargs, $k, false))
+    extractq = :($gf(vargs, $k, false))
     if l != -1
       extractq = :(data($extractq))
     end
     push!(q.args, :($symk = $extractq))
   end
-  if isnotpartial
+  Dlen = vecunrolllen(D)
+  N = maximum(lengths)
+  Sreduced = (S > 0) && (lengths[S] == -1) && N != -1
+  if isnotpartial & (Sreduced | (Dlen == N))
     q = Expr(:call, :f)
     for k ∈ 1:K
-      push!(q.args, :(getfield(vargs, $k, false)))
+      push!(q.args, :($gf(vargs, $k, false)))
     end
     return Expr(:block, Expr(:meta, :inline), q)
   end
-  N = maximum(lengths)
-  Dlen = vecunrolllen(D)
-  Sreduced = (S > 0) && (lengths[S] == -1) && N != -1
   if Sreduced
     M = N
     t = q
   else
-    @assert (N == Dlen)
+    @assert (N ≤ Dlen)
     if Dlen == -1
       @assert (M == 1)
     else
@@ -231,10 +246,11 @@ end
     if lengths[k] == -1
       push!(call.args, syms[k])
     else
-      push!(call.args, Expr(:call, :getfield, syms[k], m, false))
+      push!(call.args, Expr(:call, gf, syms[k], m, false))
     end
   end
-  if N == -1
+  # minimal change in behavior to fix case when !Sreduced by N -> Dlen; TODO: what should Dlen be here?
+  if Sreduced ? (N == -1) : (Dlen == -1)
    push!(q.args, call)
    return q
   end
@@ -245,8 +261,8 @@
     end
   end
   Sreduced && return q
-  for m ∈ M+1:N
-    push!(t.args, :(getfield(dd, $m, false)))
+  for m ∈ M+1:max(N,Dlen)
+    push!(t.args, :($gf(dd, $m, false)))
   end
   push!(q.args, :(VecUnroll($t)))
   q
@@ -257,6 +273,7 @@ function parent_op_name(
 )
   opp = parents_op[n]
   parent = mangledvar(opp)
+  u = 0
   if n == tiledouterreduction
     parent = Symbol(parent, modsuffix)
   else
@@ -275,7 +292,15 @@
   if opisvectorized && isload(opp) && (!isvectorized(opp))
     parent = Symbol(parent, "##broadcasted##")
   end
-  parent
+  parent, u
+end
+function getuouterreduct(ls::LoopSet, op::Operation, suffix)
+  us = ls.unrollspecification[]
+  if us.vloopnum === us.u₁loopnum # unroll u₂
+    suffix
+  else # unroll u₁
+    us.u₁
+  end
 end
 
 function getu₁full(ls::LoopSet, u₁::Int)
@@ -324,6 +349,7 @@ function lower_compute!(
   opunrolled = u₁unrolledsym || isu₁unrolled(op)
   # parent_names, parents_u₁syms, parents_u₂syms = parent_unroll_status(op, u₁loop, u₂loop, suffix)
   parents_u₁syms, parents_u₂syms = parent_unroll_status(op, u₁loopsym, u₂loopsym, vloopsym, u₂max)
+  # tiledouterreduction = if num_loops(ls) == 1# (suffix == -1)# || (vloopsym === u₂loopsym)
   tiledouterreduction = if (suffix == -1)# || (vloopsym === u₂loopsym)
     suffix_ = Symbol("")
     -1
@@ -367,6 +393,9 @@
   # parentsyms = [opp.variable for opp ∈ parents(op)]
   Uiter = opunrolled ? u₁ - 1 : 0
   isreduct = isreduction(op)
+  # if isreduct
+  #   @show u₁unrolledsym, u₂unrolledsym, isu₁unrolled(op), isu₂unrolled(op) op
+  # end
   if Base.libllvm_version < v"11.0.0" && (suffix ≠ -1) && isreduct# && (iszero(suffix) || (ls.unrollspecification[].u₂ - 1 == suffix))
     # if (length(reduceddependencies(op)) > 0) | (length(reducedchildren(op)) > 0)# && (iszero(suffix) || (ls.unrollspecification[].u₂ - 1 == suffix))
     # instrfid = findfirst(isequal(instr.instr), (:vfmadd, :vfnmadd, :vfmsub, :vfnmsub))
@@ -408,8 +437,8 @@
     # modsuffix = ls.unrollspecification[].u₁#getu₁full(ls, u₁)#u₁
     # Symbol(mangledvar(op), '_', modsuffix)
     # else
-    modsuffix = suffix % tiled_outerreduct_unroll(ls)
-    Symbol(mangledvar(op), modsuffix)
+    modsuffix = 0#suffix % tiled_outerreduct_unroll(ls)
+    Symbol(mangledvar(op), modsuffix)
     # end
     # dopartialmap = u₁ > 1
 
@@ -421,7 +450,8 @@
     # isouterreduct = true
     isouterreduct = isanouterreduction(ls, op)
     u₁reduct = isouterreduct ? getu₁full(ls, u₁) : getu₁forreduct(ls, op, u₁)
-    dopartialmap = u₁reduct > u₁
+    # @show isouterreduct, u₁reduct, op
+    dopartialmap = u₁reduct ≠ u₁
     Symbol(mvar, '_', u₁reduct)
   else
     Symbol(mvar, '_', u₁)
@@ -441,7 +471,10 @@
     if ((isvectorized(first(parents_op)) && !isvectorized(op)) && !dependent_outer_reducts(ls, op)) ||
       (parents_u₁syms[n] != u₁unrolledsym) || (parents_u₂syms[n] != u₂unrolledsym)
 
-      selfopname = parent_op_name(ls, parents_op, n, modsuffix, suffix_, parents_u₁syms, parents_u₂syms, u₁, opisvectorized, tiledouterreduction)
+      selfopname, uₚ = parent_op_name(ls, parents_op, n, modsuffix, suffix_, parents_u₁syms, parents_u₂syms, u₁, opisvectorized, tiledouterreduction)
+      # if (uₚ ≠ 0) & (uₚ ≠ u₁)
+      #   dopartialmap = true
+      # end
       push!(instrcall.args, selfopname)
     else
      push!(instrcall.args, varsym)
@@ -450,8 +483,20 @@
       # this checks if the parent is u₂ unrolled but this operation is not, in which case we need to reduce it.
       push!(instrcall.args, reduce_expr_u₂(mangledvar(opp), instruction(opp), ureduct(ls)))
     else
-      parent = parent_op_name(ls, parents_op, n, modsuffix, suffix_, parents_u₁syms, parents_u₂syms, u₁, opisvectorized, tiledouterreduction)
-      push!(instrcall.args, parent)
+      parent, uₚ = parent_op_name(ls, parents_op, n, modsuffix, suffix_, parents_u₁syms, parents_u₂syms, u₁, opisvectorized, tiledouterreduction)
+      # if name(op) === Symbol("##op#9536")
+      #   @show parent
+      # end
+      if (selfdep == 0) && search_tree(parents(opp), name(op))
+        selfdep = n
+        push!(instrcall.args, parent)
+      elseif (uₚ ≠ 0) & (uₚ > u₁)
+        push!(instrcall.args, :(subset_vec_unroll($parent, StaticInt{$u₁}())))
+      else
+        push!(instrcall.args, parent)
+      end
+
+      # @show uₚ, u₁, op
     end
   end
   selfdepreduce = ifelse(((!u₁unrolledsym) & isu₁unrolled(op)) & (u₁ > 1), selfdep, 0)
```
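To see what the new `subset_vec_unroll` helper does at runtime, here is a hedged illustration (it is an internal function, not public API, so the qualified name and this usage are assumptions):

```julia
using LoopVectorization
using VectorizationBase: Vec, VecUnroll
using Static: StaticInt

# A VecUnroll carrying 4 SIMD vectors, as a parent op unrolled by 4 might produce:
vu = VecUnroll((Vec(1.0, 2.0), Vec(3.0, 4.0), Vec(5.0, 6.0), Vec(7.0, 8.0)))

# Keep only the first 2 vectors, matching a consumer unrolled by u₁ = 2; this is
# how a parent unrolled by a larger factor can feed an op unrolled by a smaller one.
LoopVectorization.subset_vec_unroll(vu, StaticInt(2))  # VecUnroll of the first 2 Vecs
```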
