Implemented better loop outlining. Should implement an even better version later...

chriselrod · chriselrod · commit a73a1010fb03 · 2020-01-31T18:26:07.000-05:00
diff --git a/src/add_compute.jl b/src/add_compute.jl
@@ -106,7 +106,11 @@ function add_reduction_update_parent!(
         elseif reduct_zero === :one
             push!(ls.preamble_ones, identifier(reductinit))
         else
-            pushpreamble!(ls, Expr(:(=), name(reductinit), reductzero))
+            if reductzero === :true || reductzero === :false
+                pushpreamble!(ls, Expr(:(=), name(reductinit), reductzero))
+            else
+                pushpreamble!(ls, Expr(:(=), name(reductinit), Expr(:call, reductzero, ls.T)))
+            end
             pushpreamble!(ls, op, name, reductinit)
         end
         if isconstant(parent) && reduct_zero === parent.instruction.mod #we can use parent op as initialization.
@@ -166,7 +170,7 @@ function add_compute!(
         elseif arg ∈ ls.loopsymbols
             loopsym = gensym(arg)
             pushpreamble!(ls, Expr(:(=), loopsym, LoopValue()))
-            loopsymop = add_simple_load!(ls, gensym(loopsym), ArrayReference(loopsym, [arg]), elementbytes)
+            loopsymop = add_simple_load!(ls, gensym(loopsym), ArrayReference(loopsym, [arg]), elementbytes, false)
             push!(ls.syms_aliasing_refs, name(loopsymop))
             push!(ls.refs_aliasing_syms, loopsymop.ref)
             pushparent!(parents, deps, reduceddeps, loopsymop)
diff --git a/src/add_loads.jl b/src/add_loads.jl
@@ -27,7 +27,7 @@ end
 
 # for use with broadcasting
 function add_simple_load!(
-    ls::LoopSet, var::Symbol, ref::ArrayReference, elementbytes::Int
+    ls::LoopSet, var::Symbol, ref::ArrayReference, elementbytes::Int, actualarray::Bool = true
 )
     loopdeps = Symbol[s for s ∈ ref.indices]
     mref = ArrayReferenceMeta(
@@ -38,7 +38,7 @@ function add_simple_load!(
         :getindex, memload, loopdeps,
         NODEPENDENCY, NOPARENTS, mref
     )
-    add_vptr!(ls, op)
+    add_vptr!(ls, op.ref.ref.array, vptr(op.ref), actualarray)
     pushop!(ls, op, var)
 end
 function add_load_ref!(ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int)
diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl
@@ -203,7 +203,41 @@ function generate_call(ls::LoopSet, IUT)
     q
 end
 
-function setup_call(ls::LoopSet, inline = Int8(2), U = zero(Int8), T = zero(Int8))
+function setup_call_noinline(ls::LoopSet, inline = Int8(2), U = zero(Int8), T = zero(Int8))
+    call = generate_call(ls, (inline,U,T))
+    hasouterreductions = length(ls.outer_reductions) > 0
+    q = ls.preamble
+    if hasouterreductions
+        outer_reducts = Expr(:local)
+        for or ∈ ls.outer_reductions
+            op = ls.operations[or]
+            var = name(op)
+            mvar = mangledvar(op)
+            out = Symbol(mvar, 0)
+            push!(outer_reducts.args, out)
+        end
+        push!(q.args, outer_reducts)
+        retv = loopset_return_value(ls, Val(false))
+        call = Expr(:(=), retv, call)
+        push!(q.args, gc_preserve(ls, call))
+        push!(q.args, Expr(:return, retv))
+        q = Expr(:block, Expr(:(=), retv, Expr(:call, Expr(:(->), Expr(:tuple, ls.includedactualarrays...), q), ls.includedactualarrays...)))
+        for or ∈ ls.outer_reductions
+            op = ls.operations[or]
+            var = name(op)
+            mvar = mangledvar(op)
+            instr = instruction(op)
+            out = Symbol(mvar, 0)
+            push!(q.args, Expr(:(=), var, Expr(:call, lv(reduction_scalar_combine(instr)), out, var)))
+        end
+    else
+        push!(q.args, gc_preserve(ls, call))
+        push!(q.args, Expr(:return, :nothing))
+        q = Expr(:call, Expr(:(->), Expr(:tuple, ls.includedactualarrays...), q), ls.includedactualarrays...)
+    end
+    q
+end
+function setup_call_inline(ls::LoopSet, inline = Int8(2), U = zero(Int8), T = zero(Int8))
     call = generate_call(ls, (inline,U,T))
     hasouterreductions = length(ls.outer_reductions) > 0
     if hasouterreductions
@@ -219,12 +253,22 @@ function setup_call(ls::LoopSet, inline = Int8(2), U = zero(Int8), T = zero(Int8
         instr = instruction(op)
         out = Symbol(mvar, 0)
         push!(outer_reducts.args, out)
-        # push!(q.args, Expr(:(=), var, Expr(:call, lv(reduction_scalar_combine(instr)), Expr(:call, lv(:SVec), out), var)))
         push!(q.args, Expr(:(=), var, Expr(:call, lv(reduction_scalar_combine(instr)), out, var)))
     end
     hasouterreductions && pushpreamble!(ls, outer_reducts)
     append!(ls.preamble.args, q.args)
     ls.preamble
 end
-
+function setup_call(ls::LoopSet, inline = Int8(2), U = zero(Int8), T = zero(Int8))
+    # We outline/inline at the macro level by creating/not creating an anonymous function.
+    # The old API instead was based on inlining or not inlining the generated function, but
+    # the generated function must be inlined into the initial loop preamble for performance reasons.
+    # Creating an anonymous function and calling it also achieves the outlining, while still
+    # inlining the generated function into the loop preamble.
+    if inline == Int8(2)
+        setup_call_inline(ls, Int8(2), U, T)
+    else
+        setup_call_noinline(ls, Int8(2), U, T)
+    end
+end
 
diff --git a/src/constructors.jl b/src/constructors.jl
@@ -41,14 +41,15 @@ function substitute_broadcast(q::Expr, mod::Symbol)
     ex
 end
 
+
 function LoopSet(q::Expr, mod::Symbol = :LoopVectorization)
     q = SIMDPirates.contract_pass(q)
     ls = LoopSet(mod)
     copyto!(ls, q)
     resize!(ls.loop_order, num_loops(ls))
     ls
 end
-
+LoopSet(q::Expr, m::Module) = LoopSet(macroexpand(m, q), Symbol(m))
 
 """
     @avx
@@ -84,11 +85,10 @@ true
 
 """
 macro avx(q)
-    mod = Symbol(__module__)
     q2 = if q.head === :for
-        setup_call(LoopSet(q, mod))
+        setup_call(LoopSet(q, __module__))
     else# assume broadcast
-        substitute_broadcast(q, mod)
+        substitute_broadcast(q, Symbol(__module__))
     end
     esc(q2)
 end
@@ -130,24 +130,24 @@ macro avx(arg, q)
     @assert q.head === :for
     @assert arg.head === :(=)
     inline, U, T = check_macro_kwarg(arg)
-    esc(setup_call(LoopSet(q, Symbol(__module__)), inline, U, T))
+    esc(setup_call(LoopSet(q, __module__), inline, U, T))
 end
 macro avx(arg1, arg2, q)
     @assert q.head === :for
     inline, U, T = check_macro_kwarg(arg1)
     inline, U, T = check_macro_kwarg(arg2, inline, U, T)
-    esc(setup_call(LoopSet(q, Symbol(__module__)), inline, U, T))
+    esc(setup_call(LoopSet(q, __module__), inline, U, T))
 end
 
 
 
 macro _avx(q)
-    esc(lower(LoopSet(q, Symbol(__module__))))
+    esc(lower(LoopSet(q, __module__)))
 end
 macro _avx(arg, q)
     @assert q.head === :for
     inline, U, T = check_macro_kwarg(arg)
-    esc(lower(LoopSet(q, Symbol(__module__)), U, T))
+    esc(lower(LoopSet(q, __module__), U, T))
 end
 
 
diff --git a/src/costs.jl b/src/costs.jl
@@ -131,6 +131,8 @@ const COST = Dict{Instruction,InstructionCost}(
     Instruction(:>>) => InstructionCost(1, 0.5),
     Instruction(:>>>) => InstructionCost(1, 0.5),
     Instruction(:<<) => InstructionCost(1, 0.5),
+    Instruction(:max) => InstructionCost(4,0.5),
+    Instruction(:min) => InstructionCost(4,0.5),
     Instruction(:ifelse) => InstructionCost(1, 0.5),
     Instruction(:vifelse) => InstructionCost(1, 0.5),
     Instruction(:inv) => InstructionCost(13,4.0,-2.0,1),
@@ -185,6 +187,8 @@ const ADDITIVE_IN_REDUCTIONS = 1.0
 const MULTIPLICATIVE_IN_REDUCTIONS = 2.0
 const ANY = 3.0
 const ALL = 4.0
+const MAX = 5.0
+const MIN = 6.0
 
 const REDUCTION_CLASS = Dict{Symbol,Float64}(
     :+ => ADDITIVE_IN_REDUCTIONS,
@@ -213,28 +217,30 @@ const REDUCTION_CLASS = Dict{Symbol,Float64}(
     :reduced_add => ADDITIVE_IN_REDUCTIONS,
     :reduced_prod => MULTIPLICATIVE_IN_REDUCTIONS,
     :reduced_all => ALL,
-    :reduced_any => ANY
+    :reduced_any => ANY,
+    :max => MAX,
+    :min => MIN
 )
 reduction_instruction_class(instr::Symbol) = get(REDUCTION_CLASS, instr, NaN)
 reduction_instruction_class(instr::Instruction) = get(REDUCTION_CLASS, instr.instr, NaN)
 function reduction_to_single_vector(x::Float64)
-    x == 1.0 ? :evadd : x == 2.0 ? :evmul : x == 3.0 ? :vand : x == 4.0 ? :vor : throw("Reduction not found.")
+    x == 1.0 ? :evadd : x == 2.0 ? :evmul : x == 3.0 ? :vor : x == 4.0 ? :vand : x == 5.0 ? :max : x == 6.0 ? :min : throw("Reduction not found.")
 end
 reduction_to_single_vector(x) = reduction_to_single_vector(reduction_instruction_class(x))
 function reduction_to_scalar(x::Float64)
-    x == 1.0 ? :vsum : x == 2.0 ? :vprod : x == 3.0 ? :vany : x == 4.0 ? :vall : throw("Reduction not found.")
+    x == 1.0 ? :vsum : x == 2.0 ? :vprod : x == 3.0 ? :vany : x == 4.0 ? :vall : x == 5.0 ? :maximum : x == 6.0 ? :minimum : throw("Reduction not found.")
 end
 reduction_to_scalar(x) = reduction_to_scalar(reduction_instruction_class(x))
 function reduction_scalar_combine(x::Float64)
-    x == 1.0 ? :reduced_add : x == 2.0 ? :reduced_prod : x == 3.0 ? :reduced_any : x == 4.0 ? :reduced_all : throw("Reduction not found.")
+    x == 1.0 ? :reduced_add : x == 2.0 ? :reduced_prod : x == 3.0 ? :reduced_any : x == 4.0 ? :reduced_all : x == 5.0 ? :reduced_max : x == 6.0 ? :reduced_min : throw("Reduction not found.")
 end
 reduction_scalar_combine(x) = reduction_scalar_combine(reduction_instruction_class(x))
 function reduction_combine_to(x::Float64)
-    x == 1.0 ? :reduce_to_add : x == 2.0 ? :reduce_to_prod : x == 3.0 ? :reduce_to_any : x == 4.0 ? :reduce_to_all : throw("Reduction not found.")
+    x == 1.0 ? :reduce_to_add : x == 2.0 ? :reduce_to_prod : x == 3.0 ? :reduce_to_any : x == 4.0 ? :reduce_to_all : x == 5.0 ? :reduce_to_max : x == 6.0 ? :reduce_to_min : throw("Reduction not found.")
 end
 reduction_combine_to(x) = reduction_combine_to(reduction_instruction_class(x))
 function reduction_zero(x::Float64) 
-    x == 1.0 ? :zero : x == 2.0 ? :one : x == 3.0 ? :false : x == 4.0 ? :true : throw("Reduction not found.")
+    x == 1.0 ? :zero : x == 2.0 ? :one : x == 3.0 ? :false : x == 4.0 ? :true : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.")
 end
 reduction_zero(x) = reduction_zero(reduction_instruction_class(x))
 
@@ -291,6 +297,11 @@ const FUNCTIONSYMBOLS = Dict{Type{<:Function},Instruction}(
     typeof(SLEEFPirates.cos) => :cos,
     typeof(sincos) => :sincos,
     typeof(Base.FastMath.sincos_fast) => :sincos,
-    typeof(SLEEFPirates.sincos) => :sincos
+    typeof(SLEEFPirates.sincos) => :sincos,
+    typeof(max) => :max,
+    typeof(min) => :min,
+    typeof(<<) => :<<,
+    typeof(>>) => :>>,
+    typeof(>>>) => :>>>
 )
 
diff --git a/src/graphs.jl b/src/graphs.jl
@@ -157,6 +157,7 @@ struct LoopSet
     preamble_zeros::Vector{Int}
     preamble_ones::Vector{Int}
     includedarrays::Vector{Symbol}
+    includedactualarrays::Vector{Symbol}
     syms_aliasing_refs::Vector{Symbol}
     refs_aliasing_syms::Vector{ArrayReferenceMeta}
     cost_vec::Matrix{Float64}
@@ -228,8 +229,7 @@ function LoopSet(mod::Symbol)# = :LoopVectorization)
         Tuple{Int,Int}[],
         Tuple{Int,Float64}[],
         Int[],Int[],
-        Tuple{Symbol,Int}[],
-        Symbol[],
+        Symbol[], Symbol[], Symbol[],
         ArrayReferenceMeta[],
         Matrix{Float64}(undef, 4, 2),
         Matrix{Int}(undef, 4, 2),
diff --git a/src/lowering.jl b/src/lowering.jl
@@ -204,22 +204,23 @@ function reduce_expr!(q::Expr, ls::LoopSet, U::Int)
     end
 end
 function gc_preserve(ls::LoopSet, q::Expr)
-    length(ls.includedarrays) == 0 && return q
+    length(ls.includedactualarrays) == 0 && return q
     gcp = Expr(:macrocall, Expr(:(.), :GC, QuoteNode(Symbol("@preserve"))), LineNumberNode(@__LINE__, @__FILE__))
-    for array ∈ ls.includedarrays
+    for array ∈ ls.includedactualarrays
         push!(gcp.args, array)
     end
     q.head === :block && push!(q.args, nothing)
     push!(gcp.args, q)
     Expr(:block, gcp)
 end
 function determine_eltype(ls::LoopSet)
-    # length(ls.includedarrays) == 0 && return REGISTER_SIZE >>> 3
-    if length(ls.includedarrays) == 1
-        return Expr(:call, :eltype, first(ls.includedarrays))
+    if length(ls.includedactualarrays) == 0
+        return Expr(:call, :typeof, 0)
+    elseif length(ls.includedactualarrays) == 1
+        return Expr(:call, :eltype, first(ls.includedactualarrays))
     end
     promote_q = Expr(:call, :promote_type)
-    for array ∈ ls.includedarrays
+    for array ∈ ls.includedactualarrays
         push!(promote_q.args, Expr(:call, :eltype, array))
     end
     promote_q
diff --git a/src/memory_ops_common.jl b/src/memory_ops_common.jl
@@ -1,8 +1,9 @@
 add_vptr!(ls::LoopSet, op::Operation) = add_vptr!(ls, op.ref)
 add_vptr!(ls::LoopSet, mref::ArrayReferenceMeta) = add_vptr!(ls, mref.ref.array, vptr(mref))
-function add_vptr!(ls::LoopSet, array::Symbol, vptrarray::Symbol = vptr(array))
+function add_vptr!(ls::LoopSet, array::Symbol, vptrarray::Symbol = vptr(array), actualarray::Bool = true)
     if !includesarray(ls, array)
         push!(ls.includedarrays, array)
+        actualarray && push!(ls.includedactualarrays, array)
         pushpreamble!(ls, Expr(:(=), vptrarray, Expr(:call, lv(:stridedpointer), array)))
     end
     nothing
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -94,14 +94,14 @@ end
     end
     function dot_unroll2avx_noinline(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
         z = zero(T)
-        @avx inline=true unroll=2 for i ∈ 1:length(x)
+        @avx inline=false unroll=2 for i ∈ 1:length(x)
             z += x[i]*y[i]
         end
         return z
     end
     function dot_unroll3avx_inline(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
         z = zero(T)
-        @avx unroll=3 inline=false for i ∈ 1:length(x)
+        @avx unroll=3 inline=true for i ∈ 1:length(x)
             z += x[i]*y[i]
         end
         return z
@@ -245,6 +245,9 @@ end
                 res[i] = sin(i * code_phase_delta)
             end
         end
+        @macroexpand @avx for i ∈ eachindex(res)
+                res[i] = sin(i * code_phase_delta)
+            end
         function calc_sins_avx!(res::AbstractArray{T}) where {T}
             code_phase_delta = T(0.01)
             @_avx for i ∈ eachindex(res)