JuliaSIMD
diff --git a/‎src/LoopVectorization.jl
Lines changed: 2 additions & 0 deletions b/‎src/LoopVectorization.jl
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/add_compute.jl
Lines changed: 42 additions & 11 deletions b/‎src/add_compute.jl
Lines changed: 42 additions & 11 deletions
diff --git a/‎src/determinestrategy.jl
Lines changed: 79 additions & 20 deletions b/‎src/determinestrategy.jl
Lines changed: 79 additions & 20 deletions
diff --git a/‎src/graphs.jl
Lines changed: 13 additions & 17 deletions b/‎src/graphs.jl
Lines changed: 13 additions & 17 deletions
@@ -26,6 +26,8 @@ export LowDimArray, stridedpointer, vectorizable,
     vmap, vmap!, vmapnt, vmapnt!, vmapntt, vmapntt!,
     vfilter, vfilter!
 
+const VECTORWIDTHSYMBOL, ELTYPESYMBOL = Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##") # module-level symbols for the vector width and loop element type; same names that were previously the default `W`/`T` arguments of `LoopSet`
+
 
 include("vectorizationbase_extensions.jl")
 include("predicates.jl")
 
@@ -100,17 +100,48 @@ end
 #     end
 #     false
 # end
+function add_reduced_deps!(op::Operation, reduceddeps::Vector{Symbol}) # Add `reduceddeps` to the reduced dependencies of `op`; always returns `nothing`.
+    # op.dependencies = copy(loopdependencies(op))
+    # mergesetv!(loopdependencies(op), reduceddeps)
+    reduceddepsop = reduceddependencies(op)
+    if reduceddepsop === NODEPENDENCY # identity check; NODEPENDENCY is presumably a shared "no dependencies" placeholder — confirm
+        op.reduced_deps = copy(reduceddeps) # assign a copy so `op` does not alias the caller's vector
+    else
+        mergesetv!(reduceddepsop, reduceddeps) # presumably an in-place set-style merge into `op`'s existing vector — confirm against mergesetv!
+    end
+    # reduceddepsop = reducedchildren(op)
+    # if reduceddepsop === NODEPENDENCY
+    #     op.reduced_children = copy(reduceddeps)
+    # else
+    #     mergesetv!(reduceddepsop, reduceddeps)
+    # end
+    nothing
+end
 
-# function substitute_op_in_parents!(vparents::Vector{Operation}, replacer::Operation, replacee::Operation)
-#     for i ∈ eachindex(vparents)
-#         opp = vparents[i]
-#         if opp === replacee
-#             vparents[i] = replacer
-#         else
-#             substitute_op_in_parents!(parents(opp), replacer, replacee)
-#         end
-#     end
+# function substitute_op_in_parents!(
+#     vparents::Vector{Operation}, replacer::Operation, replacee::Operation, reduceddeps::Vector{Symbol}
+# )
+#     @show replacer replacee
+#     # 
+#     substitute_op_in_parents_recurse!(vparents, replacer, replacee)
 # end
+function substitute_op_in_parents!( # Recursively replace every occurrence of `replacee` with `replacer` in `vparents` and in the parents of its entries.
+    vparents::Vector{Operation}, replacer::Operation, replacee::Operation, reduceddeps::Vector{Symbol}
+) # Returns `true` if at least one replacement was made at this level or below, `false` otherwise.
+    found = false
+    for i ∈ eachindex(vparents)
+        opp = vparents[i]
+        if opp === replacee # matched by identity, not by name or value
+            vparents[i] = replacer # mutate the parent vector in place
+            found = true
+        else
+            fopp = substitute_op_in_parents!(parents(opp), replacer, replacee, reduceddeps) # no match here: search this parent's own parents
+            fopp && add_reduced_deps!(opp, reduceddeps) # a replacement happened beneath `opp`, so record `reduceddeps` on it
+            found |= fopp # propagate the result up to the caller
+        end
+    end
+    found
+end
 
 
 function add_reduction_update_parent!(
@@ -157,8 +188,8 @@ function add_reduction_update_parent!(
         if instr.instr ∈ (:-, :vsub!, :vsub, :/, :vfdiv!, :vfidiv!)
             update_deps!(deps, reduceddeps, reductinit)#parent) # deps and reduced deps will not be disjoint
         end
-    # elseif !isouterreduction
-        # substitute_op_in_parents!(vparents, reductinit, parent)
+    elseif !isouterreduction
+        substitute_op_in_parents!(vparents, reductinit, parent, reduceddeps)
     end
     update_reduction_status!(vparents, reduceddeps, name(reductinit))
     # this is the op added by add_compute
 
@@ -618,6 +618,7 @@ function add_constant_offset_load_elmination_cost!(
     end
 end
 
+
 # Just tile outer two loops?
 # But optimal order within tile must still be determined
 # as well as size of the tiles.
@@ -789,35 +790,93 @@ function choose_unroll_order(ls::LoopSet, lowest_cost::Float64 = Inf)
         new_order, state = iter
     end
 end
+
+
+"""
+This function searches for unrolling combinations that will cause LoopVectorization to generate invalid code.
+
+Currently, it is only searching for one scenario, based on how `isunrolled_sym` and lowering currently work.
+`isunrolled_sym` tries to avoid the creation of excessive numbers of accumulation vectors in the case of reductions.
+If an unrolled loop isn't reduced, it will need separate vectors.
+But separate vectors for a reduced loop are not needed. Separate vectors will help to break up dependency chains,
+so you want to unroll at least one of the loops. However, reductions demand combining all the separate vectors,
+and each vector also eats a valuable register, so it's best to avoid excessive numbers of these accumulation vectors.
+
+
+If a reduced op depends on both unrolled loops (u1 and u2), it will check over which of these it is reduced. If...
+neither: cannot avoid unrolling it along both
+one of them: don't unroll the reduced loop
+both of them: don't unroll along u2 (unroll along u1)
+
+Now, a look at lowering:
+It interleaves u1-unrolled operations in an effort to improve superscalar parallelism,
+while u2-unrolled operations are lowered by block. E.g., op_u2id_u1id (as they're printed):
+
+u2 = 0
+opa_0_0 = fa(...)
+opa_0_1 = fa(...)
+opa_0_2 = fa(...)
+opb_0_0 = fb(...)
+opb_0_1 = fb(...)
+opb_0_2 = fb(...)
+u2 += 1
+opa_1_0 = fa(...)
+opa_1_1 = fa(...)
+opa_1_2 = fa(...)
+opb_1_0 = fb(...)
+opb_1_1 = fb(...)
+opb_1_2 = fb(...)
+
+what if `opa` vectors were not replicated across u1?
+opa_0_ = fa(...)
+opa_0_ = fa(...)
+opa_0_ = fa(...)
+
+Then unless `fa` was taking the previous `opa_0_`s as an argument and updating them, this would be wrong, because it'd be overwriting the previous `opa_0_` values.
+"""
+function reject_candidate(op::Operation, u₁loopsym::Symbol, u₂loopsym::Symbol) # Per-operation check: returns `true` when `op` makes the (u₁, u₂) unrolling pair unacceptable.
+    if iscompute(op) && u₁loopsym ∈ reduceddependencies(op) && u₁loopsym ∈ loopdependencies(op) # compute op that both depends on and is reduced over the u₁ loop
+        if u₂loopsym ∉ reduceddependencies(op) && !any(opp -> name(opp) === name(op), parents(op)) # ...is not reduced over the u₂ loop, and no parent shares `op`'s name
+            return true
+        end
+    end
+    false
+end
+
+function reject_candidate(ls::LoopSet, u₁loopsym::Symbol, u₂loopsym::Symbol) # LoopSet-level check: `true` if any operation in `ls` rejects the (u₁, u₂) pair.
+    for op ∈ operations(ls)
+        reject_candidate(op, u₁loopsym, u₂loopsym) && return true # stop at the first rejecting operation
+    end
+    false
+end
+
 function choose_tile(ls::LoopSet)
     lo = LoopOrders(ls)
     best_order = copyto!(ls.loop_order.bestorder, lo.syms)
     bestu₁ = bestu₂ = best_vec = first(best_order) # filler
-    new_order, state = iterate(lo) # right now, new_order === best_order
     u₁, u₂, lowest_cost = 0, 0, Inf
-    nloops = length(new_order)
-    while true
-        for new_vec ∈ new_order # view to skip first
-            for nt ∈ 1:nloops-1
-                newu₂ = new_order[nt]
-                for newu₁ ∈ @view(new_order[nt+1:end])
-                    u₁temp, u₂temp, cost_temp = evaluate_cost_tile(ls, new_order, newu₁, newu₂, new_vec)
-                    if cost_temp < lowest_cost
-                        lowest_cost = cost_temp
-                        u₁, u₂ = u₁temp, u₂temp
-                        best_vec = new_vec
-                        bestu₂ = newu₂
-                        bestu₁ = newu₁
-                        copyto!(best_order, new_order)
-                        save_tilecost!(ls)
-                    end
+    for newu₂ ∈ lo.syms, newu₁ ∈ lo.syms # every ordered (u₂, u₁) pair of loop symbols; formerly restricted with @view(new_order[nt+1:end])
+        ((newu₁ == newu₂) || reject_candidate(ls, newu₁, newu₂)) && continue # skip identical loops and pairs flagged by reject_candidate
+        new_order, state = iterate(lo) # right now, new_order === best_order
+        while true # walk every loop order produced by `lo` for this (u₁, u₂) pair
+            for new_vec ∈ new_order # view to skip first
+                u₁temp, u₂temp, cost_temp = evaluate_cost_tile(ls, new_order, newu₁, newu₂, new_vec)
+                if cost_temp < lowest_cost # strictly cheaper than anything seen so far: record this candidate
+                    lowest_cost = cost_temp
+                    u₁, u₂ = u₁temp, u₂temp
+                    best_vec = new_vec
+                    bestu₂ = newu₂
+                    bestu₁ = newu₁
+                    copyto!(best_order, new_order)
+                    save_tilecost!(ls)
                 end
             end
+            iter = iterate(lo, state)
+            iter === nothing && break # loop orders exhausted for this pair; continue with the next pair
+            new_order, state = iter
         end
-        iter = iterate(lo, state)
-        iter === nothing && return best_order, bestu₁, bestu₂, best_vec, u₁, u₂, lowest_cost
-        new_order, state = iter
     end
+    best_order, bestu₁, bestu₂, best_vec, u₁, u₂, lowest_cost # NOTE(review): if every pair was skipped, this returns the initial fillers with u₁ == u₂ == 0 and lowest_cost == Inf — confirm callers handle that
 end
 # Last in order is the inner most loop
 function choose_order_cost(ls::LoopSet)
 
@@ -44,26 +44,26 @@ function Loop(itersymbol::Symbol, start::Union{Int,Symbol}, stop::Union{Int,Symb
 end
 Base.length(loop::Loop) = 1 + loop.stophint - loop.starthint
 isstaticloop(loop::Loop) = loop.startexact & loop.stopexact
-function startloop(loop::Loop, isvectorized, W, itersymbol)
+function startloop(loop::Loop, isvectorized, itersymbol) # Build the assignment `Expr` that initializes `itersymbol` to the start of `loop`.
     startexact = loop.startexact
     if isvectorized
         if startexact
-            Expr(:(=), itersymbol, Expr(:call, lv(:_MM), W, loop.starthint))
+            Expr(:(=), itersymbol, Expr(:call, lv(:_MM), VECTORWIDTHSYMBOL, loop.starthint)) # vectorized: `_MM` call on the module-level width symbol and the value `loop.starthint`
         else
-            Expr(:(=), itersymbol, Expr(:call, lv(:_MM), W, loop.startsym))
+            Expr(:(=), itersymbol, Expr(:call, lv(:_MM), VECTORWIDTHSYMBOL, loop.startsym)) # same, but the start is the symbol `loop.startsym` instead of `loop.starthint`
         end
     elseif startexact
         Expr(:(=), itersymbol, loop.starthint)
     else
         Expr(:(=), itersymbol, Expr(:call, lv(:unwrap), loop.startsym))
     end
 end
-function vec_looprange(loop::Loop, W::Symbol, UF::Int, mangledname::Symbol)
+function vec_looprange(loop::Loop, UF::Int, mangledname::Symbol)
     isunrolled = UF > 1
     incr = if isunrolled
-        Expr(:call, lv(:valmuladd), W, UF, -2)
+        Expr(:call, lv(:valmuladd), VECTORWIDTHSYMBOL, UF, -2)
     else
-        Expr(:call, lv(:valsub), W, 2)
+        Expr(:call, lv(:valsub), VECTORWIDTHSYMBOL, 2)
     end
     if loop.stopexact # split for type stability
         Expr(:call, lv(:scalar_less), mangledname, Expr(:call, :-, loop.stophint, incr))
@@ -80,22 +80,22 @@ function looprange(loop::Loop, incr::Int, mangledname::Symbol)
     end
 end
 function terminatecondition(
-    loop::Loop, us::UnrollSpecification, n::Int, W::Symbol, mangledname::Symbol, inclmask::Bool, UF::Int = unrollfactor(us, n)
+    loop::Loop, us::UnrollSpecification, n::Int, mangledname::Symbol, inclmask::Bool, UF::Int = unrollfactor(us, n) # selects looprange (non-vectorized, or masked with increment 1) or vec_looprange
 )
     if !isvectorized(us, n)
         looprange(loop, UF, mangledname)
     elseif inclmask
         looprange(loop, 1, mangledname)
     else
-        vec_looprange(loop, W, UF, mangledname) # may not be u₂loop
+        vec_looprange(loop, UF, mangledname) # may not be u₂loop
     end
 end
-function incrementloopcounter(us::UnrollSpecification, n::Int, W::Symbol, mangledname::Symbol, UF::Int = unrollfactor(us, n))
+function incrementloopcounter(us::UnrollSpecification, n::Int, mangledname::Symbol, UF::Int = unrollfactor(us, n))
     if isvectorized(us, n)
         if UF == 1
-            Expr(:(=), mangledname, Expr(:call, lv(:valadd), W, mangledname))
+            Expr(:(=), mangledname, Expr(:call, lv(:valadd), VECTORWIDTHSYMBOL, mangledname))
         else
-            Expr(:+=, mangledname, Expr(:call, lv(:valmul), W, UF))
+            Expr(:+=, mangledname, Expr(:call, lv(:valmul), VECTORWIDTHSYMBOL, UF))
         end
     else
         Expr(:+=, mangledname, UF)
@@ -158,8 +158,6 @@ struct LoopSet
     reg_pres::Matrix{Float64}
     included_vars::Vector{Bool}
     place_after_loop::Vector{Bool}
-    W::Symbol
-    T::Symbol
     mod::Symbol
 end
 
@@ -240,10 +238,9 @@ end
 #     false
 # end
 
-
 includesarray(ls::LoopSet, array::Symbol) = array ∈ ls.includedarrays
 
-function LoopSet(mod::Symbol, W = Symbol("##Wvecwidth##"), T = Symbol("##Tloopeltype##"))# = :LoopVectorization)
+function LoopSet(mod::Symbol)
     LoopSet(
         Symbol[], [0], Loop[],
         Dict{Symbol,Operation}(),
@@ -259,8 +256,7 @@ function LoopSet(mod::Symbol, W = Symbol("##Wvecwidth##"), T = Symbol("##Tloopel
         ArrayReferenceMeta[],
         Matrix{Float64}(undef, 4, 2),
         Matrix{Float64}(undef, 4, 2),
-        Bool[], Bool[],
-        W, T, mod
+        Bool[], Bool[], mod
     )
 end