JuliaSIMD
diff --git a/‎src/costs.jl
Lines changed: 6 additions & 5 deletions b/‎src/costs.jl
Lines changed: 6 additions & 5 deletions
diff --git a/‎src/determinestrategy.jl
Lines changed: 35 additions & 16 deletions b/‎src/determinestrategy.jl
Lines changed: 35 additions & 16 deletions
diff --git a/‎src/graphs.jl
Lines changed: 46 additions & 16 deletions b/‎src/graphs.jl
Lines changed: 46 additions & 16 deletions
@@ -32,9 +32,9 @@ function vector_cost(instruction::InstructionCost, Wshift, sizeof_T)
         W = 1 << Wshift
         extra_latency = sl - srt
         srt *= W
-        sl = srt + extra_latency
+        sl = round(Int, srt + extra_latency)
     else # we assume custom cost, and that latency == recip_throughput
-        sl, srt = scaling, scaling
+        sl, srt = round(Int,scaling), scaling
     end    
     srt, sl, srp
 end
@@ -63,8 +63,8 @@ const OPAQUE_INSTRUCTION = InstructionCost(50, 50.0, -1.0, VectorizationBase.REG
 #    consolidated into a single register. The number of LICM-ed setindex!, on the other
 #    hand, should indicate how many registers we're keeping live for the sake of eventually storing.
 const COST = Dict{Symbol,InstructionCost}(
-    :getindex => InstructionCost(3,0.5,-3.0,0),
-    :setindex! => InstructionCost(3,1.0,-3.0,1),
+    :getindex => InstructionCost(-3.0,0.5,3,0),
+    :setindex! => InstructionCost(-3.0,1.0,3,1),
     :zero => InstructionCost(1,0.5),
     :one => InstructionCost(3,0.5),
     :(+) => InstructionCost(4,0.5),
@@ -93,7 +93,8 @@ const COST = Dict{Symbol,InstructionCost}(
     :exp => InstructionCost(20,20.0,20.0,18),
     :sin => InstructionCost(18,15.0,68.0,23),
     :cos => InstructionCost(18,15.0,68.0,26),
-    :sincos => InstructionCost(25,22.0,70.0,26)
+    :sincos => InstructionCost(25,22.0,70.0,26)#,
+    # Symbol("##CONSTANT##") => InstructionCost(0,0.0)
 )
 for (k, v) ∈ COST # so we can look up Symbol(typeof(function))
     COST[Symbol("typeof(", k, ")")] = v
 
@@ -2,14 +2,16 @@
 # TODO: FIXME for general case
 unitstride(op, s) = first(loopdependencies(op)) === s
 
-function cost(op::Operation, unrolled::Symbol, Wshift::Int, size_T::Int)
+function cost(op::Operation, unrolled::Symbol, Wshift::Int, size_T::Int = op.elementbytes)
+    isconstant(op) && return 0.0, 0, 0
     # Wshift == dependson(op, unrolled) ? Wshift : 0
     # c = first(cost(instruction(op), Wshift, size_T))::Int
     instr = instruction(op)
     opisunrolled = dependson(op, unrolled)
     srt, sl, srp = opisunrolled ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr)
     if accesses_memory(op)
         # either vbroadcast/reductionstore, vmov(a/u)pd, or gather/scatter
+        # @show instr, unrolled, loopdependencies(op), unitstride(op, unrolled)
         if opisunrolled
             if !unitstride(op, unrolled)# || !isdense(op) # need gather/scatter
                 r = (1 << Wshift)
@@ -72,7 +74,9 @@ function evaluate_cost_unroll(
             included_vars[id] && continue
             # it must also be a subset of defined symbols
             loopdependencies(op) ⊆ nested_loop_syms || continue
-            hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf
+            # hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf
+            rd = reduceddependencies(op)
+            hasintersection(rd, nested_loop_syms[1:end-length(rd)]) && return Inf
             included_vars[id] = true
 
             total_cost += iter * first(cost(op, unrolled, Wshift, size_T))
@@ -97,7 +101,8 @@ function depchain_cost!(
     if accesses_memory(op)
         return sl, rt
     end
-    slᵢ, rtᵢ = cost(op, 1 << Wshift, Wshift, unrolled)
+    # @show instruction(op)
+    rtᵢ, slᵢ = cost(op, unrolled, Wshift, size_T)
     sl + slᵢ, rt + rtᵢ
 end
 
@@ -110,6 +115,7 @@ function determine_unroll_factor(
     # The strategy is to use an unroll factor of 1, unless there appears to be loop carried dependencies (ie, num_reductions > 0)
     # The assumption here is that unrolling provides no real benefit, unless it is needed to enable OOO execution by breaking up these dependency chains
     num_reductions = sum(isreduction, operations(ls))
+    # @show num_reductions
     iszero(num_reductions) && return 1
     # So if num_reductions > 0, we set the unroll factor to be high enough so that the CPU can be kept busy
     # if there are, U = max(1, round(Int, max(latency) * throughput / num_reductions)) = max(1, round(Int, latency / (recip_throughput * num_reductions)))
@@ -119,7 +125,7 @@ function determine_unroll_factor(
     visited_nodes = fill(false, length(operations(ls)))
     for op ∈ operations(ls)
         if isreduction(op) && dependson(op, unrolled)
-            sl, rt = depchain_cost!(visited_nodes, instruction(op), unrolled, Wshift, size_T)
+            sl, rt = depchain_cost!(visited_nodes, op, unrolled, Wshift, size_T)
             latency = max(sl, latency)
             recip_throughput += rt
         end
@@ -131,17 +137,19 @@ function tile_cost(X, U, T)
     X[1] + X[4] + X[2] / T + X[3] / U
 end
 function solve_tilesize(X, R)
+    first(R) == 0 && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
     # We use lagrange multiplier to finding floating point values for U and T
     # first solving for U via quadratic formula
     # X is vector of costs, and R is of register pressures
-    @show X
-    @show R 
+    # @show X
+    # @show R 
     RR = VectorizationBase.REGISTER_COUNT - R[3] - R[4]
     a = (R[1])^2*X[2] - (R[2])^2*R[1]*X[3]/RR
     b = 2*R[1]*R[2]*X[3]
     c = -RR*R[1]*X[3]
     Ufloat = (sqrt(b^2 - 4a*c) - b) / (2a)
     Tfloat = (RR - Ufloat*R[2])/(Ufloat*R[1])
+    # @show Ufloat, Tfloat
     Ulow = max(1, floor(Int, Ufloat)) # must be at least 1
     Tlow = max(1, floor(Int, Tfloat)) # must be at least 1
     Uhigh = Ulow + 1 #ceil(Int, Ufloat)
@@ -150,14 +158,17 @@ function solve_tilesize(X, R)
     RR = VectorizationBase.REGISTER_COUNT - R[3] - R[4]
     U, T = Ulow, Tlow
     tcost = tile_cost(X, Ulow, Tlow)
-    if RR > Ulow*Thigh*R[1] + Ulow*R[2]
+    # @show Ulow*Thigh*R[1] + Ulow*R[2]
+    if RR ≥ Ulow*Thigh*R[1] + Ulow*R[2]
         tcost_temp = tile_cost(X, Ulow, Thigh)
+        # @show tcost_temp, tcost
         if tcost_temp < tcost
             tcost = tcost_temp
             U, T = Ulow, Thigh
         end
     end
-    if RR > Uhigh*Tlow*R[1] + Uhigh*R[2]
+    # @show Uhigh*Tlow*R[1] + Uhigh*R[2]
+    if RR ≥ Uhigh*Tlow*R[1] + Uhigh*R[2]
         tcost_temp = tile_cost(X, Uhigh, Tlow)
         if tcost_temp < tcost
             tcost = tcost_temp
@@ -175,7 +186,9 @@ end
 function solve_tilesize_constT(X, R, T)
     floor(Int, (VectorizationBase.REGISTER_COUNT - R[3] - R[4]) / (T * R[1] + R[2]))
 end
+# Tiling here is about alleviating register pressure for the U x T tile.
 function solve_tilesize(X, R, Umax, Tmax)
+    first(R) == 0 && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
     U, T, cost = solve_tilesize(X, R)
     U_too_large = U > Umax
     T_too_large = T > Tmax
@@ -215,9 +228,12 @@ function evaluate_cost_tile(
     # cost_mat[1] / ( unrolled * tiled)
     # cost_mat[2] / ( tiled)
     # cost_mat[3] / ( unrolled)
-    # cost_mat[4] 
+    # cost_mat[4]
+    # @show order
     cost_vec = zeros(Float64, 4)
     reg_pressure = zeros(Int, 4)
+    @inbounds reg_pressure[2] = 1
+    @inbounds reg_pressure[3] = 1
     for n ∈ 1:N
         itersym = order[n]
         # Add to set of defined symbles
@@ -235,16 +251,17 @@ function evaluate_cost_tile(
             included_vars[id] && continue
             # it must also be a subset of defined symbols
             loopdependencies(op) ⊆ nested_loop_syms || continue
-            # @show nested_loop_syms
-            # @show reduceddependencies(op)
+            # # @show nested_loop_syms
+            # # @show reduceddependencies(op)
             rd = reduceddependencies(op)
             hasintersection(rd, nested_loop_syms[1:end-length(rd)]) && return 0,0,Inf
             included_vars[id] = true
             rt, lat, rp = cost(op, unrolled, Wshift, size_T)
-            @show instruction(op), rt, lat, rp, iter
+            # @show instruction(op), rt, lat, rp, iter
             rt *= iter
             isunrolled = unrolled ∈ loopdependencies(op)
             istiled = tiled ∈ loopdependencies(op)
+            # @show isunrolled, istiled
             if isunrolled && istiled # no cost decrease; cost must be repeated
                 cost_vec[1] += rt
                 reg_pressure[1] += rp
@@ -315,8 +332,8 @@ end
 # that I could come up with.
 function Base.iterate(lo::LoopOrders, state)
     advance_state!(state) || return nothing
-    # @show state
-    syms = copy!(lo.buff, lo.syms)
+    # # @show state
+    syms = copyto!(lo.buff, lo.syms)
     for i ∈ eachindex(state)
         sᵢ = state[i]
         sᵢ == 0 || swap!(syms, i, i + sᵢ)
@@ -340,15 +357,15 @@ function choose_unroll_order(ls::LoopSet, lowest_cost::Float64 = Inf)
 end
 function choose_tile(ls::LoopSet)
     lo = LoopOrders(ls)
-    best_order = lo.syms
+    best_order = copy(lo.syms)
     new_order, state = iterate(lo) # right now, new_order === best_order
     U, T, lowest_cost = 0, 0, Inf
     while true
         U_temp, T_temp, cost_temp = evaluate_cost_tile(ls, new_order)
         if cost_temp < lowest_cost
             lowest_cost = cost_temp
             U, T = U_temp, T_temp
-            best_order = new_order
+            copyto!(best_order, new_order)
         end
         iter = iterate(lo, state)
         iter === nothing && return best_order, U, T, lowest_cost
@@ -363,8 +380,10 @@ function choose_order(ls::LoopSet)
     end
     uorder, uc = choose_unroll_order(ls, tc)
     if num_loops(ls) <= 1 || tc > uc # if tc == uc, then that probably means we want tc, and no unrolled managed to beat the tiled cost
+        # copyto!(ls.loop_order.loopnames, uorder)
         return uorder, determine_unroll_factor(ls, uorder), -1
     else
+        # copyto!(ls.loop_order.loopnames, torder)
         return torder, tU, tT
     end
 end
 
@@ -60,14 +60,14 @@ LoopOrder() = LoopOrder(Vector{Operation}[],Symbol[])
 Base.empty!(lo::LoopOrder) = foreach(empty!, lo.oporder)
 function Base.resize!(lo::LoopOrder, N::Int)
     Nold = length(lo.loopnames)
-    resize!(lo.oporder, 24N)
-    for n ∈ 24Nold+1:24N
+    resize!(lo.oporder, 32N) # 32 == 4*2*2*2, the per-loop slot count implied by Base.size(::LoopOrder)
+    for n ∈ 32Nold+1:32N # give each slot added by the resize its own empty Operation vector
         lo.oporder[n] = Operation[]
     end
     resize!(lo.loopnames, N)
     lo
 end
-Base.size(lo::LoopOrder) = (3,2,2,2,length(lo.loopnames))
+Base.size(lo::LoopOrder) = (4,2,2,2,length(lo.loopnames))
 Base.@propagate_inbounds Base.getindex(lo::LoopOrder, i::Int) = lo.oporder[i]
 Base.@propagate_inbounds Base.getindex(lo::LoopOrder, i...) = lo.oporder[LinearIndices(size(lo))[i...]]
 
@@ -80,6 +80,7 @@ struct LoopSet
     loop_order::LoopOrder
     # stridesets::Dict{ShortVector{Symbol},ShortVector{Symbol}}
     preamble::Expr # TODO: add preamble to lowering
+    includedarrays::Vector{Symbol}
 end
 function LoopSet()
     LoopSet(
@@ -88,13 +89,14 @@ function LoopSet()
         Operation[],
         Int[],
         LoopOrder(),
-        Expr(:block,)
+        Expr(:block,),
+        Symbol[]
     )
 end
 num_loops(ls::LoopSet) = length(ls.loops)
 function oporder(ls::LoopSet)
     N = length(ls.loop_order.loopnames)
-    reshape(ls.loop_order.oporder, (3,2,2,2,N))
+    reshape(ls.loop_order.oporder, (4,2,2,2,N)) # 5-d reshape of the flat oporder vector; dims match Base.size(::LoopOrder)
 end
 names(ls::LoopSet) = ls.loop_order.loopnames
 isstaticloop(ls::LoopSet, s::Symbol) = ls.loops[s].hintexact
@@ -163,7 +165,7 @@ function register_single_loop!(ls::LoopSet, looprange::Expr)
             Loop(itersym, N)
         end
     elseif f === :eachindex
-        N = gensym(:loop, itersym)
+        N = gensym(Symbol(:loop, itersym))
         push!(ls.preamble.args, Expr(:(=), N, Expr(:call, :length, r.args[2])))
         Loop(itersym, N)
     else
@@ -191,11 +193,19 @@ function add_loop!(ls::LoopSet, q::Expr, elementbytes::Int = 8)
         Base.push!(ls, q, elementbytes)
     end
 end
+function add_vptr!(ls::LoopSet, indexed::Symbol)
+    indexed ∈ ls.includedarrays && return nothing # this array already has its preamble entry
+    push!(ls.includedarrays, indexed)
+    sptr = Expr(:call, Expr(:(.), :VectorizationBase, QuoteNode(:stridedpointer)), indexed)
+    push!(ls.preamble.args, Expr(:(=), Symbol(:vptr_, indexed), sptr)) # vptr_<indexed> = VectorizationBase.stridedpointer(<indexed>)
+    nothing
+end
 
 function add_load!(
     ls::LoopSet, var::Symbol, indexed::Symbol, indices::AbstractVector, elementbytes::Int = 8
 )
     op = Operation( length(operations(ls)), var, elementbytes, :getindex, memload, indices, [indexed], NOPARENTS )
+    add_vptr!(ls, indexed)
     pushop!(ls, op, var)
 end
 function add_load_ref!(ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int = 8)
@@ -226,12 +236,26 @@ function setdiffv!(s3::AbstractVector{T}, s1::AbstractVector{T}, s2::AbstractVec
         (s ∈ s2) || (s ∉ s3 && push!(s3, s))
     end
 end
-function add_constant!(ls::LoopSet, var::Symbol, elementbytes::Int = 8, deps = NODEPENDENCY)
-    pushop!(ls, Operation(length(operations(ls)), var, elementbytes, :undef, constant, deps, NODEPENDENCY, NOPARENTS), var)
+# This version has no dependencies, and thus will not be lowered
+### if it is a literal, that literal is either var"##ZERO#Float##", var"##ONE#Float##", or has to have been assigned to var in the preamble.
+# if it is a literal, that literal has to have been assigned to var in the preamble.
+function add_constant!(ls::LoopSet, var::Symbol, elementbytes::Int = 8)
+    pushop!(ls, Operation(length(operations(ls)), var, elementbytes, Symbol("##CONSTANT##"), constant, NODEPENDENCY, NODEPENDENCY, NOPARENTS), var) # instruction slot holds the ##CONSTANT## marker; deps, reduced deps and parents use the NODEPENDENCY/NOPARENTS sentinels
+end
-function add_constant!(ls, var, elementbytes::Int = 8, sym = gensym(:constant), deps = NODEPENDENCY)
+function add_constant!(ls::LoopSet, var, elementbytes::Int = 8)
+    sym = gensym(:temp) # fresh name that the preamble binds to the value
     push!(ls.preamble.args, Expr(:(=), sym, var))
-    add_constant!(ls, sym, elementbytes, deps)
+    add_constant!(ls, sym, elementbytes) # sym isa Symbol, so this reaches the dependency-free Symbol method
+end
+# This version has loop dependencies `deps`; the op is created for `sym`, and var gets assigned to sym when lowering.
+function add_constant!(ls::LoopSet, var::Symbol, deps::Vector{Symbol}, sym::Symbol = gensym(:constant), elementbytes::Int = 8)
+    # length(deps) == 0 && push!(ls.preamble.args, Expr(:(=), sym, var))
+    pushop!(ls, Operation(length(operations(ls)), sym, elementbytes, var, constant, deps, NODEPENDENCY, NOPARENTS), sym) # `var` occupies the instruction slot of the Operation
+end
+function add_constant!(ls::LoopSet, var, deps::Vector{Symbol}, sym::Symbol = gensym(:constant), elementbytes::Int = 8)
+    tmp = gensym(:temp) # fresh name that the preamble binds to the non-Symbol value
+    push!(ls.preamble.args, Expr(:(=), tmp, var))
+    add_constant!(ls, tmp, deps, sym, elementbytes) # tmp isa Symbol: dispatches to the Symbol-with-deps method
 end
 function pushparent!(parents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, parent::Operation)
     push!(parents, parent)
@@ -258,7 +282,7 @@ end
 function add_reduction!(
     parents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, ls::LoopSet, var::Symbol, elementbytes::Int = 8
 )
-    parent = get!(ls.opdict, var) do
+    get!(ls.opdict, var) do
         p = add_constant!(ls, var, elementbytes)
         push!(ls.outer_reductions, identifier(p))
         p
@@ -287,6 +311,7 @@ function add_compute!(ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int = 8)
         parent = getop(ls, var)
         setdiffv!(reduceddeps, deps, loopdependencies(parent))
         pushparent!(parents, deps, reduceddeps, parent) # deps and reduced deps will not be disjoint
+        # append!(reduceddependencies(parent), reduceddeps)
     end
     op = Operation(length(operations(ls)), var, elementbytes, instr, compute, deps, reduceddeps, parents)
     pushop!(ls, op, var) # note this overwrites the entry in the operations dict, but not the vector
@@ -296,6 +321,7 @@ function add_store!(
 )
     parent = getop(ls, var)
     op = Operation( length(operations(ls)), indexed, elementbytes, :setindex!, memstore, indices, reduceddependencies(parent), [parent] )
+    add_vptr!(ls, indexed)
     pushop!(ls, op, var)
 end
 function add_store_ref!(ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int = 8)
@@ -335,7 +361,7 @@ function Base.push!(ls::LoopSet, ex::Expr, elementbytes::Int = 8)
             if RHS isa Expr
                 add_operation!(ls, LHS, RHS, elementbytes)
             else
-                add_constant!(ls, RHS, elementbytes, LHS, [keys(ls.loops)...])
+                add_constant!(ls, RHS, [keys(ls.loops)...], LHS, elementbytes)
             end
         elseif LHS isa Expr
             @assert LHS.head === :ref
@@ -363,7 +389,9 @@ end
 function fillorder!(ls::LoopSet, order::Vector{Symbol}, loopistiled::Bool)
     lo = ls.loop_order
     ro = lo.loopnames # reverse order; will have same order as lo
-    copyto!(lo.names, order)
+    # @show 1, ro, order
+    # copyto!(ro, order)
+    # @show 2, ro, order
     empty!(lo)
     nloops = length(order)
     if loopistiled
@@ -378,17 +406,19 @@ function fillorder!(ls::LoopSet, order::Vector{Symbol}, loopistiled::Bool)
     for _n ∈ 1:nloops
         n = 1 + nloops - _n
         ro[_n] = loopsym = order[n]
+        #loopsym = order[n]
         for (id,op) ∈ enumerate(operations(ls))
             included_vars[id] && continue
-            loopsym ∈ dependencies(op) || continue
+            loopsym ∈ loopdependencies(op) || continue
             included_vars[id] = true
             isunrolled = (unrolled ∈ loopdependencies(op)) + 1
-            istiled = (loopistiled ? false : (tiled ∈ loopdependencies(op))) + 1
-            optype = Int(op.node_type)
+            istiled = (loopistiled ? (tiled ∈ loopdependencies(op)) : false) + 1
+            optype = Int(op.node_type) + 1
             after_loop = (length(reduceddependencies(op)) > 0) + 1
             push!(lo[optype,isunrolled,istiled,after_loop,_n], op)
         end
     end    
+    @show 3, ro, order
 end