JuliaSIMD
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
Lines changed: 2 additions & 2 deletions
diff --git a/src/broadcast.jl b/src/broadcast.jl
Lines changed: 14 additions & 14 deletions
diff --git a/src/costs.jl b/src/costs.jl
Lines changed: 51 additions & 6 deletions
diff --git a/src/determinestrategy.jl b/src/determinestrategy.jl
Lines changed: 10 additions & 2 deletions
@@ -2,8 +2,8 @@ module LoopVectorization
 
 using VectorizationBase, SIMDPirates, SLEEFPirates, MacroTools, Parameters
 using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector_load_expr,
-    mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valadd, valsub
-using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod
+    mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valadd, valsub, _MM
+using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod
 using Base.Broadcast: Broadcasted, DefaultArrayStyle
 using LinearAlgebra: Adjoint, Transpose
 using MacroTools: prewalk, postwalk
 
@@ -47,12 +47,12 @@ function add_broadcast!(
     K = gensym(:K)
     mA = gensym(:Aₘₖ)
     mB = gensym(:Bₖₙ)
-    pushpreamble!(ls, Expr(:(=), mA, Expr(:(.), bcname, QuoteNode(:a))))
-    pushpreamble!(ls, Expr(:(=), mB, Expr(:(.), bcname, QuoteNode(:b))))
-    pushpreamble!(ls, Expr(:(=), K, Expr(:call, :size, mB, 1)))
+    pushprepreamble!(ls, Expr(:(=), mA, Expr(:(.), bcname, QuoteNode(:a))))
+    pushprepreamble!(ls, Expr(:(=), mB, Expr(:(.), bcname, QuoteNode(:b))))
+    pushprepreamble!(ls, Expr(:(=), K, Expr(:call, :size, mB, 1)))
 
     k = gensym(:k)
-    ls.loops[k] = Loop(k, K)
+    ls.loops[k] = Loop(k, 0, K)
     m = loopsyms[1];
     if ndims(B) == 1
         bloopsyms = Symbol[k]
@@ -74,9 +74,9 @@ function add_broadcast!(
     # set Cₘₙ = 0
     # setC = add_constant!(ls, zero(promote_type(recursive_eltype(A), recursive_eltype(B))), cloopsyms, mC, elementbytes)
     setC = if elementbytes == 4
-        add_constant!(ls, 0f0, cloopsyms, mC, elementbytes)
+        add_constant!(ls, 0f0, cloopsyms, mC, Symbol(""), elementbytes)
     else#if elementbytes == 4
-        add_constant!(ls, 0.0, cloopsyms, mC, elementbytes)
+        add_constant!(ls, 0.0, cloopsyms, mC, Symbol(""), elementbytes)
     end       
     # compute Cₘₙ += Aₘₖ * Bₖₙ
     reductop = Operation(
@@ -111,7 +111,7 @@ function add_broadcast_adjoint_array!(
     ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol}, ::Type{A}, elementbytes::Int = 8
 ) where {T,N,A<:AbstractArray{T,N}}
     parent = gensym(:parent)
-    pushpreamble!(ls, Expr(:(=), parent, Expr(:call, :parent, bcname)))
+    pushprepreamble!(ls, Expr(:(=), parent, Expr(:call, :parent, bcname)))
     ref = ArrayReference(parent, Union{Symbol,Int}[loopsyms[N + 1 - n] for n ∈ 1:N])
     add_simple_load!( ls, destname, ref, elementbytes )::Operation    
 end
@@ -143,7 +143,7 @@ function add_broadcast!(
     ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol}, ::Type{T}, elementbytes::Int = 8
 ) where {T<:Union{Integer,Float32,Float64}}
     op = add_constant!(ls, destname, elementbytes) # or replace elementbytes with sizeof(T) ? u
-    pushpreamble!(ls, Expr(:(=), mangledvar(op), bcname))
+    pushprepreamble!(ls, Expr(:(=), mangledvar(op), bcname))
     op
 end
 function add_broadcast!(
@@ -172,7 +172,7 @@ function add_broadcast!(
     reduceddeps = Symbol[]
     for (i,arg) ∈ enumerate(args)
         argname = gensym(:arg)
-        pushpreamble!(ls, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,@__FILE__), Expr(:(=), argname, Expr(:ref, bcargs, i))))
+        pushprepreamble!(ls, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,@__FILE__), Expr(:(=), argname, Expr(:ref, bcargs, i))))
         # dynamic dispatch
         parent = add_broadcast!(ls, gensym(:temp), argname, loopsyms, arg, elementbytes)::Operation
         pushparent!(parents, deps, reduceddeps, parent)
@@ -195,10 +195,10 @@ end
     sizes = Expr(:tuple)
     for (n,itersym) ∈ enumerate(loopsyms)
         Nsym = gensym(:N)
-        ls.loops[itersym] = Loop(itersym, Nsym)
+        ls.loops[itersym] = Loop(itersym, 0, Nsym)
         push!(sizes.args, Nsym)
     end
-    pushpreamble!(ls, Expr(:(=), sizes, Expr(:call, :size, :dest)))
+    pushprepreamble!(ls, Expr(:(=), sizes, Expr(:call, :size, :dest)))
     elementbytes = sizeof(T)
     add_broadcast!(ls, :dest, :bc, loopsyms, BC, elementbytes)
     add_simple_store!(ls, :dest, ArrayReference(:dest, loopsyms), elementbytes)
@@ -216,14 +216,14 @@ end
     # need to construct the LoopSet
     loopsyms = [gensym(:n) for n ∈ 1:N]
     ls = LoopSet()
-    pushpreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
+    pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
     sizes = Expr(:tuple)
     for (n,itersym) ∈ enumerate(loopsyms)
         Nsym = gensym(:N)
-        ls.loops[itersym] = Loop(itersym, Nsym)
+        ls.loops[itersym] = Loop(itersym, 0, Nsym)
         push!(sizes.args, Nsym)
     end
-    pushpreamble!(ls, Expr(:(=), sizes, Expr(:call, :size, :dest′)))
+    pushprepreamble!(ls, Expr(:(=), sizes, Expr(:call, :size, :dest′)))
     elementbytes = sizeof(T)
     add_broadcast!(ls, :dest, :bc, loopsyms, BC, elementbytes)
     add_simple_store!(ls, :dest, ArrayReference(:dest, reverse(loopsyms)), elementbytes)
 
@@ -84,7 +84,15 @@ const COST = Dict{Instruction,InstructionCost}(
     Instruction(:vadd) => InstructionCost(4,0.5),
     Instruction(:vsub) => InstructionCost(4,0.5),
     Instruction(:vmul) => InstructionCost(4,0.5),
-    Instruction(:vdiv) => InstructionCost(13,4.0,-2.0),
+    Instruction(:vfdiv) => InstructionCost(13,4.0,-2.0),
+    Instruction(:evadd) => InstructionCost(4,0.5),
+    Instruction(:evsub) => InstructionCost(4,0.5),
+    Instruction(:evmul) => InstructionCost(4,0.5),
+    Instruction(:evfdiv) => InstructionCost(13,4.0,-2.0),
+    Instruction(:reduced_add) => InstructionCost(4,0.5),# ignoring reduction part of cost, might be nop
+    Instruction(:reduced_prod) => InstructionCost(4,0.5),# ignoring reduction part of cost, might be nop
+    Instruction(:reduce_to_add) => InstructionCost(0,0.0,0.0,0),
+    Instruction(:reduce_to_prod) => InstructionCost(0,0.0,0.0,0),
     Instruction(:abs2) => InstructionCost(4,0.5),
     Instruction(:vabs2) => InstructionCost(4,0.5),
     Instruction(:(==)) => InstructionCost(1, 0.5),
@@ -110,14 +118,20 @@ const COST = Dict{Instruction,InstructionCost}(
     Instruction(:vfnmadd_fast) => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
     Instruction(:vfnmsub_fast) => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
     Instruction(:sqrt) => InstructionCost(15,4.0,-2.0),
+    Instruction(:sqrt_fast) => InstructionCost(15,4.0,-2.0),
     Instruction(:log) => InstructionCost(20,20.0,40.0,20),
     Instruction(:exp) => InstructionCost(20,20.0,20.0,18),
     Instruction(:sin) => InstructionCost(18,15.0,68.0,23),
     Instruction(:cos) => InstructionCost(18,15.0,68.0,26),
     Instruction(:sincos) => InstructionCost(25,22.0,70.0,26),
+    Instruction(:log_fast) => InstructionCost(20,20.0,40.0,20),
+    Instruction(:exp_fast) => InstructionCost(20,20.0,20.0,18),
+    Instruction(:sin_fast) => InstructionCost(18,15.0,68.0,23),
+    Instruction(:cos_fast) => InstructionCost(18,15.0,68.0,26),
+    Instruction(:sincos_fast) => InstructionCost(25,22.0,70.0,26),
     Instruction(:identity) => InstructionCost(0,0.0,0.0,0),
     Instruction(:adjoint) => InstructionCost(0,0.0,0.0,0),
-    Instruction(:transpose) => InstructionCost(0,0.0,0.0,0),
+    Instruction(:transpose) => InstructionCost(0,0.0,0.0,0)
     # Symbol("##CONSTANT##") => InstructionCost(0,0.0)
 )
 # for (k, v) ∈ COST # so we can look up Symbol(typeof(function))
@@ -131,6 +145,9 @@ const CORRESPONDING_REDUCTION = Dict{Instruction,Instruction}(
     Instruction(:vadd) => Instruction(:vsum),
     Instruction(:vsub) => Instruction(:vsum),
     Instruction(:vmul) => Instruction(:vprod),
+    Instruction(:evadd) => Instruction(:vsum),
+    Instruction(:evsub) => Instruction(:vsum),
+    Instruction(:evmul) => Instruction(:vprod),
     Instruction(:&) => Instruction(:vall),
     Instruction(:|) => Instruction(:vany),
     Instruction(:muladd) => Instruction(:vsum),
@@ -140,7 +157,11 @@ const CORRESPONDING_REDUCTION = Dict{Instruction,Instruction}(
     Instruction(:vfmadd) => Instruction(:vsum),
     Instruction(:vfmsub) => Instruction(:vsum),
     Instruction(:vfnmadd) => Instruction(:vsum),
-    Instruction(:vfnmsub) => Instruction(:vsum)
+    Instruction(:vfnmsub) => Instruction(:vsum),
+    Instruction(:vfmadd_fast) => Instruction(:vsum),
+    Instruction(:vfmsub_fast) => Instruction(:vsum),
+    Instruction(:vfnmadd_fast) => Instruction(:vsum),
+    Instruction(:vfnmsub_fast) => Instruction(:vsum)
 )
 const REDUCTION_TRANSLATION = Dict{Instruction,Instruction}(
     Instruction(:+) => Instruction(:evadd),
@@ -158,25 +179,37 @@ const REDUCTION_TRANSLATION = Dict{Instruction,Instruction}(
     Instruction(:vfmadd) => Instruction(:evadd),
     Instruction(:vfmsub) => Instruction(:evadd),
     Instruction(:vfnmadd) => Instruction(:evadd),
-    Instruction(:vfnmsub) => Instruction(:evadd)
+    Instruction(:vfnmsub) => Instruction(:evadd),
+    Instruction(:vfmadd_fast) => Instruction(:evadd),
+    Instruction(:vfmsub_fast) => Instruction(:evadd),
+    Instruction(:vfnmadd_fast) => Instruction(:evadd),
+    Instruction(:vfnmsub_fast) => Instruction(:evadd)
 )
 const REDUCTION_ZERO = Dict{Instruction,Symbol}(
     Instruction(:+) => :zero,
     Instruction(:vadd) => :zero,
+    Instruction(:evadd) => :zero,
     Instruction(:*) => :one,
     Instruction(:vmul) => :one,
+    Instruction(:evmul) => :one,
     Instruction(:-) => :zero,
     Instruction(:vsub) => :zero,
+    Instruction(:evsub) => :zero,
     Instruction(:/) => :one,
     Instruction(:vfdiv) => :one,
+    Instruction(:evfdiv) => :one,
     Instruction(:muladd) => :zero,
     Instruction(:fma) => :zero,
     Instruction(:vmuladd) => :zero,
     Instruction(:vfma) => :zero,
     Instruction(:vfmadd) => :zero,
     Instruction(:vfmsub) => :zero,
     Instruction(:vfnmadd) => :zero,
-    Instruction(:vfnmsub) => :zero    
+    Instruction(:vfnmsub) => :zero,
+    Instruction(:vfmadd_fast) => :zero,
+    Instruction(:vfmsub_fast) => :zero,
+    Instruction(:vfnmadd_fast) => :zero,
+    Instruction(:vfnmsub_fast) => :zero
 )
 
 lv(x) = GlobalRef(LoopVectorization, x)
@@ -197,7 +230,15 @@ const REDUCTION_SCALAR_COMBINE = Dict{Instruction,GlobalRef}(
     Instruction(:vfmadd) => lv(:reduced_add),
     Instruction(:vfmsub) => lv(:reduced_add),
     Instruction(:vfnmadd) => lv(:reduced_add),
-    Instruction(:vfnmsub) => lv(:reduced_add)
+    Instruction(:vfnmsub) => lv(:reduced_add),
+    Instruction(:vfmadd_fast) => lv(:reduced_add),
+    Instruction(:vfmsub_fast) => lv(:reduced_add),
+    Instruction(:vfnmadd_fast) => lv(:reduced_add),
+    Instruction(:vfnmsub_fast) => lv(:reduced_add)
+)
+const REDUCTION_COMBINETO = Dict{Symbol,Symbol}( # pairs each `reduced_*` name with its `reduce_to_*` counterpart
+    :reduced_add => :reduce_to_add, # key matches the `lv(:reduced_add)` values of REDUCTION_SCALAR_COMBINE
+    :reduced_prod => :reduce_to_prod # NOTE(review): consumers of this table are not visible in this diff — confirm usage
 )
 
 const FUNCTIONSYMBOLS = Dict{Type{<:Function},Instruction}(
@@ -230,6 +271,10 @@ const FUNCTIONSYMBOLS = Dict{Type{<:Function},Instruction}(
     typeof(SIMDPirates.vfmsub) => :vfmsub,
     typeof(SIMDPirates.vfnmadd) => :vfnmadd,
     typeof(SIMDPirates.vfnmsub) => :vfnmsub,
+    typeof(SIMDPirates.vfmadd_fast) => :vfmadd_fast,
+    typeof(SIMDPirates.vfmsub_fast) => :vfmsub_fast,
+    typeof(SIMDPirates.vfnmadd_fast) => :vfnmadd_fast,
+    typeof(SIMDPirates.vfnmsub_fast) => :vfnmsub_fast,
     typeof(sqrt) => :sqrt,
     typeof(Base.FastMath.sqrt_fast) => :sqrt,
     typeof(SIMDPirates.vsqrt) => :sqrt,
 
@@ -110,6 +110,13 @@ function parentsnotreduction(op::Operation)
     end
     return true
 end
+function roundpow2(i::Integer) # pick whichever of `u` (from nextpow2) or `u >>> 1` is nearer to `i`
+    u = VectorizationBase.nextpow2(i) # upper candidate; presumably the smallest power of two >= i — confirm in VectorizationBase
+    l = u >>> 1 # lower candidate: `u` shifted right by one bit (half of `u`)
+    ud = u - i # distance from `i` up to the upper candidate
+    ld = i - l # distance from `i` down to the lower candidate
+    ud > ld ? l : u # return `l` only when it is strictly closer; ties resolve to `u`
+end
 function unroll_no_reductions(ls, order, vectorized, Wshift, size_T)
     innermost = last(order)
     compute_rt = 0.0
@@ -125,7 +132,7 @@ function unroll_no_reductions(ls, order, vectorized, Wshift, size_T)
     end
     # heuristic guess
     # @show compute_rt, load_rt
-    min(4, round(Int, (compute_rt + load_rt + 1) / compute_rt))
+    roundpow2(min(4, round(Int, (compute_rt + load_rt + 1) / compute_rt)))
 end
 function determine_unroll_factor(
     ls::LoopSet, order::Vector{Symbol}, unrolled::Symbol, vectorized::Symbol = first(order)
@@ -171,7 +178,7 @@ function determine_unroll_factor(
         load_recip_throughput,
         store_recip_throughput
     )
-    max(1, round(Int, latency / (recip_throughput * num_reductions) ) )
+    roundpow2(max(1, round(Int, latency / (recip_throughput * num_reductions) ) ))
 end
 
 function tile_cost(X, U, T)
@@ -434,6 +441,7 @@ function choose_unroll_order(ls::LoopSet, lowest_cost::Float64 = Inf)
 end
 function choose_tile(ls::LoopSet)
     lo = LoopOrders(ls)
+    # @show lo.syms ls.loop_order.bestorder
     best_order = copyto!(ls.loop_order.bestorder, lo.syms)
     best_vec = first(best_order) # filler
     new_order, state = iterate(lo) # right now, new_order === best_order