Working on adding basic if/else support, as well as LoopSet -> Type -> LoopSet conversion.

chriselrod · chriselrod · commit 1433577a8770 · 2020-01-17T18:18:20.000-05:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.3.8"
+version = "0.3.9"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl
@@ -0,0 +1,9 @@
+
+## turn a LoopSet into a type object which can be used to reconstruct the LoopSet.
+
+
+# Try to condense in type stable manner
+function condense_operations(ls::LoopSet) # stub for condensing the operations of `ls`
+    # NOTE(review): not implemented yet; the body is empty, so this returns `nothing`.
+end
+
diff --git a/src/costs.jl b/src/costs.jl
@@ -103,6 +103,8 @@ const COST = Dict{Instruction,InstructionCost}(
     Instruction(:(<)) => InstructionCost(1, 0.5),
     Instruction(:(>=)) => InstructionCost(1, 0.5),
     Instruction(:(<=)) => InstructionCost(1, 0.5),
+    Instruction(:ifelse) => InstructionCost(1, 0.5),
+    Instruction(:vifelse) => InstructionCost(1, 0.5),
     Instruction(:inv) => InstructionCost(13,4.0,-2.0,1),
     Instruction(:vinv) => InstructionCost(13,4.0,-2.0,1),
     Instruction(:muladd) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
@@ -131,7 +133,7 @@ const COST = Dict{Instruction,InstructionCost}(
     Instruction(:sincos_fast) => InstructionCost(25,22.0,70.0,26),
     Instruction(:identity) => InstructionCost(0,0.0,0.0,0),
     Instruction(:adjoint) => InstructionCost(0,0.0,0.0,0),
-    Instruction(:transpose) => InstructionCost(0,0.0,0.0,0)
+    Instruction(:transpose) => InstructionCost(0,0.0,0.0,0),
     # Symbol("##CONSTANT##") => InstructionCost(0,0.0)
 )
 # for (k, v) ∈ COST # so we can look up Symbol(typeof(function))
diff --git a/src/determinestrategy.jl b/src/determinestrategy.jl
@@ -3,6 +3,13 @@
 # wrong for transposed matrices, and certain views/SubArrays.
 unitstride(op::Operation, s) = first(getindices(op)) === s
 
+# Register pressure of a single operation: a constant operation contributes 0,
+# any other operation contributes the `register_pressure` field of the cost
+# entry looked up for its instruction.
+function register_pressure(op::Operation)
+    isconstant(op) && return 0
+    return instruction_cost(instruction(op)).register_pressure
+end
 function cost(op::Operation, unrolled::Symbol, Wshift::Int, size_T::Int = op.elementbytes)
     isconstant(op) && return 0.0, 0, 1
     # Wshift == dependson(op, unrolled) ? Wshift : 0
@@ -45,6 +52,10 @@ function hasintersection(a, b)
     end
     false
 end
+# Count the steps of size `step` needed to cover `N` iterations: the truncated
+# quotient `div(N, step)`, plus one extra partial step whenever the remainder
+# `rem(N, step)` is nonzero.
+num_iterations(N, step) = div(N, step) + (rem(N, step) != 0)
 
 # evaluates cost of evaluating loop in given order
 # heuristically, could simplify analysis by just unrolling outer loop?
@@ -62,10 +73,8 @@ function evaluate_cost_unroll(
     for itersym ∈ order
         # Add to set of defined symbles
         push!(nested_loop_syms, itersym)
-        liter = Float64(length(ls, itersym))
-        if itersym === vectorized
-            liter /= W
-        end
+        looplength = length(ls, itersym)
+        liter = itersym === vectorized ? num_iterations(looplength, W) : looplength
         iter *= liter
         # check which vars we can define at this level of loop nest
         for (id,op) ∈ enumerate(operations(ls))
@@ -183,16 +192,16 @@ function determine_unroll_factor(
     roundpow2(max(1, round(Int, latency / (recip_throughput * num_reductions) ) ))
 end
 
-function tile_cost(X, U, T)
-    X[1] + X[4] + X[2] / T + X[3] / U
+function tile_cost(X, U, T, UL, TL) # callers pass UL, TL = lengths of the unrolled and tiled loops
+    X[1] + X[4] + X[2] * (num_iterations(TL, T)/TL) + X[3] * (num_iterations(UL, U)/UL) # X[2], X[3] scaled by steps per iteration (rounded up) instead of 1/T, 1/U
 end
-function solve_tilesize(X, R)
+function solve_tilesize(X, R, UL, TL)
     @inbounds any(iszero, (R[1],R[2],R[3])) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
     # @inbounds any(iszero, (R[1],R[2],R[3])) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
     # We use a lagrange multiplier to find floating point values for U and T
     # first solving for U via quadratic formula
     # X is vector of costs, and R is of register pressures
-    RR = REGISTER_COUNT - R[3] - R[4]
+    RR = REGISTER_COUNT - R[3] - R[4] # RR ≡ RemainingRegisters
     a = (R[1])^2*X[2] - (R[2])^2*R[1]*X[3]/RR
     b = 2*R[1]*R[2]*X[3]
     c = -RR*R[1]*X[3]
@@ -205,12 +214,12 @@ function solve_tilesize(X, R)
     Uhigh = Ulow + 1 #ceil(Int, Ufloat)
     Thigh = Tlow + 1 #ceil(Int, Tfloat)
 
-    RR = REGISTER_COUNT - R[3] - R[4]
+    # RR = REGISTER_COUNT - R[3] - R[4]
     U, T = Ulow, Tlow
-    tcost = tile_cost(X, Ulow, Tlow)
+    tcost = tile_cost(X, Ulow, Tlow, UL, TL)
     # @show Ulow*Thigh*R[1] + Ulow*R[2]
     if RR ≥ Ulow*Thigh*R[1] + Ulow*R[2]
-        tcost_temp = tile_cost(X, Ulow, Thigh)
+        tcost_temp = tile_cost(X, Ulow, Thigh, UL, TL)
         # @show tcost_temp, tcost
         if tcost_temp < tcost
             tcost = tcost_temp
@@ -222,7 +231,7 @@ function solve_tilesize(X, R)
     while RR < Uhigh*Tl*R[1] + Uhigh*R[2]
         Tl -= 1
     end
-    tcost_temp = tile_cost(X, Uhigh, Tl)
+    tcost_temp = tile_cost(X, Uhigh, Tl, UL, TL)
     if tcost_temp < tcost
         tcost = tcost_temp
         U, T = Uhigh, Tl
@@ -243,9 +252,9 @@ function solve_tilesize_constT(ls, T)
     floor(Int, (REGISTER_COUNT - R[3] - R[4]) / (T * R[1] + R[2]))
 end
 # Tiling here is about alleviating register pressure for the UxT
-function solve_tilesize(X, R, Umax, Tmax)
+function solve_tilesize(X, R, Umax, Tmax, UL, TL)
     first(R) == 0 && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
-    U, T, cost = solve_tilesize(X, R)
+    U, T, cost = solve_tilesize(X, R, UL, TL)
     # T -= T & 1
     # U = min(U, T)
     U_too_large = U > Umax
@@ -264,20 +273,37 @@ function solve_tilesize(X, R, Umax, Tmax)
     end
     U, T, cost
 end
+# Shrink the factor `U` by one when doing so does not increase the number of
+# steps needed to cover `N` iterations; any `U` of 1 or less yields 1.
+function maybedemotesize(U::Int, N::Int)
+    U ≤ 1 && return 1
+    smaller = U - 1
+    return num_iterations(N, smaller) > num_iterations(N, U) ? U : smaller
+end
 function solve_tilesize(
     ls::LoopSet, unrolled::Symbol, tiled::Symbol,
     cost_vec::AbstractVector{Float64} = @view(ls.cost_vec[:,1]),
     reg_pressure::AbstractVector{Int} = @view(ls.reg_pres[:,1])
 )
     maxT = 4#8
     maxU = 4#8
-    if isstaticloop(ls, tiled)
-        maxT = min(2maxT, looprangehint(ls, tiled))
+    tiledloop = ls.loops[tiled]
+    unrolledloop = ls.loops[unrolled]
+    if isstaticloop(tiledloop)
+        maxT = min(4maxT, length(tiledloop)) # a static loop caps T at its own length
+    end
+    if isstaticloop(unrolledloop)
+        maxU = min(4maxU, length(unrolledloop)) # likewise for U
+    end
+    U, T, cost = solve_tilesize(cost_vec, reg_pressure, maxU, maxT, length(unrolledloop), length(tiledloop))
+    # heuristic to more evenly divide small numbers of iterations
+    if isstaticloop(tiledloop) && T > 1 # must be `&&`: `a & T > 1` parses as `(a & T) > 1`, which is never true
+        T = maybedemotesize(T, length(tiledloop))
     end
-    if isstaticloop(ls, unrolled)
-        maxU = min(2maxU, looprangehint(ls, unrolled))
+    if isstaticloop(unrolledloop)
+        U = maybedemotesize(U, length(unrolledloop)) # maybedemotesize itself returns 1 for U ≤ 1
     end
-    solve_tilesize(cost_vec, reg_pressure, maxU, maxT)
+    U, T, cost
 end
 
 function set_upstream_family!(adal::Vector{T}, op::Operation, val::T) where {T}
@@ -306,7 +332,6 @@ function evaluate_cost_tile(
     innerloop = last(order)
     iters = fill(-99.9, nops)
     nested_loop_syms = Symbol[]# Set{Symbol}()
-    iter = 1.0
     # Need to check if fusion is possible
     size_T = biggest_type_size(ls)
     W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vectorized), size_T)::Tuple{Int,Int}
@@ -320,14 +345,18 @@ function evaluate_cost_tile(
     reg_pressure = reg_pres_buf(ls)
     # @inbounds reg_pressure[2] = 1
     # @inbounds reg_pressure[3] = 1
+    unrollediter = length(ls, unrolled)
+    tilediter = length(ls, tiled)
+    unrollediter = unrolled === vectorized ? num_iterations(unrollediter, W) : unrollediter # tiled cannot be vectorized, so do not check
+    iter::Int = tilediter * unrollediter
     for n ∈ 1:N
         itersym = order[n]
         # Add to set of defined symbles
         push!(nested_loop_syms, itersym)
-        if n == 1
-            iter = length(ls, itersym) * length(ls, order[2]) / W
-        elseif n > 2
-            iter *= Float64(length(ls, itersym))
+        stepsize = 1
+        if n > 2
+            itersymlooplen = length(ls, itersym)
+            iter *= itersym === vectorized ? num_iterations(itersymlooplen, W) : itersymlooplen
         end
         # check which vars we can define at this level of loop nest
         for (id, op) ∈ enumerate(ops)
@@ -480,3 +509,19 @@ function choose_order(ls::LoopSet)
     end
 end
 
+function register_pressure(ls::LoopSet) # register pressure for the cheaper of tiling / unrolling
+    # uses unroll of 1 if not tiling
+    if num_loops(ls) > 1
+        torder, tvec, tU, tT, tc = choose_tile(ls)
+    else
+        tc = Inf # tiling is not evaluated for a single loop, so it can never win below
+    end
+    uorder, uvec, uc = choose_unroll_order(ls, tc)
+    if num_loops(ls) > 1 && tc ≤ uc # tile
+        rp = @view ls.reg_pres[:,1] # field is `reg_pres`, the same one `solve_tilesize` defaults to
+        tU * tT * rp[1] + tU * rp[2] + rp[3] + rp[4]
+    else
+        sum(register_pressure, operations(ls))
+    end
+end
+
diff --git a/src/graphs.jl b/src/graphs.jl
@@ -324,6 +324,15 @@ function register_single_loop!(ls::LoopSet, looprange::Expr)
         N = gensym(Symbol(:loop, itersym))
         pushpreamble!(ls, Expr(:(=), N, Expr(:call, :length, r.args[2])))
         Loop(itersym, 0, N)
+    elseif f === :OneTo || f === Expr(:(.), :Base, :OneTo)
+        otN = r.args[2]
+        if otN isa Integer
+            Loop(itersym, 0, otN)
+        else
+            N = gensym(Symbol(:loop, itersym))
+            pushpreamble!(ls, Expr(:(=), N, otN))
+            Loop(itersym, 0, N)
+        end
     else
         throw("Unrecognized loop range type: $r.")
     end
@@ -719,6 +728,18 @@ function add_store_setindex!(ls::LoopSet, ex::Expr, elementbytes::Int = 8)
     array, raw_indices = ref_from_setindex(ex)
     add_store!(ls, (ex.args[2])::Symbol, array, rawindices, elementbytes)
 end
+function add_if!(ls::LoopSet, LHS::Symbol, RHS::Expr, elementbytes::Int = 8, mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing)
+    condition = first(RHS.args) # callers pass an Expr with head `:if`; its first argument is the condition
+    m = gensym(:mask) # unique name for the condition's result
+    condop = add_compute!(ls, m, condition, elementbytes, mpref) # add the condition as a compute operation named `m`
+    iftrue = RHS.args[2] # branch taken when the condition holds
+    iftrueisaexpr = iftrue isa Expr
+    iffalse = RHS.args[3] # else branch; this indexing throws if `RHS` has no third argument
+    iffalseisaexpr = iffalse isa Expr
+    trueisablock = iftrueisaexpr && iftrue.head !== :call # an Expr that is not a plain call is treated as a block
+    falseisablock = iffalseisaexpr && iffalse.head !== :call
+    # NOTE(review): work in progress; `LHS`, `condop` and the block flags are not used yet and no select operation is added.
+end
 # add operation assigns X to var
 function add_operation!(
     ls::LoopSet, LHS::Symbol, RHS::Expr, elementbytes::Int = 8
@@ -736,6 +757,8 @@ function add_operation!(
         else
             add_compute!(ls, LHS, RHS, elementbytes)
         end
+    elseif RHS.head === :if
+        add_if!(ls, LHS, RHS, elementbytes)
     else
         throw("Expression not recognized:\n$x")
     end
@@ -757,6 +780,8 @@ function add_operation!(
         else
             add_compute!(ls, LHS_sym, RHS, elementbytes, LHS_ref)
         end
+    elseif RHS.head === :if
+        add_if!(ls, LHS, RHS, elementbytes, LHS_ref)
     else
         throw("Expression not recognized:\n$x")
     end
@@ -817,6 +842,10 @@ function Base.push!(ls::LoopSet, ex::Expr, elementbytes::Int = 8)
         add_block!(ls, ex)
     elseif ex.head === :for
         add_loop!(ls, ex)
+    elseif ex.head === :&&
+        add_andblock!(ls, ex)
+    elseif ex.head === :||
+        add_orblock!(ls, ex)
     else
         throw("Don't know how to handle expression:\n$ex")
     end