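Make the keys of `COST` `Instruction`s again (rather than `Symbol`s), so that other libraries may extend the cost table. Add special handling of zero constants: vectorized zeros are now emitted as `vzero(W, T)` calls instead of broadcasting `zero(T)`.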

chriselrod · chriselrod · commit 758a88516c79 · 2020-01-27T01:01:40.000-05:00
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
@@ -3,10 +3,11 @@ module LoopVectorization
 using VectorizationBase, SIMDPirates, SLEEFPirates, MacroTools, Parameters
 using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector_load_expr,
     mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valadd, valsub, _MM,
-    maybestaticlength, maybestaticsize, staticm1, subsetview,
+    maybestaticlength, maybestaticsize, staticm1, subsetview, vzero,
     Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange,
     PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct
-using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod
+using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod,
+    vmullog2, vmullog10, vdivlog2, vdivlog2add, vdivlog10, vdivlog10add, vfmaddaddone
 using Base.Broadcast: Broadcasted, DefaultArrayStyle
 using LinearAlgebra: Adjoint, Transpose
 using MacroTools: prewalk, postwalk
diff --git a/src/costs.jl b/src/costs.jl
@@ -64,8 +64,10 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
     end    
     srt, sl, srp
 end
-instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION)
-instruction_cost(instruction::Instruction) = instruction_cost(instruction.instr)
+# instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION)
+# instruction_cost(instruction::Instruction) = instruction_cost(instruction.instr)
+instruction_cost(instruction::Instruction) = get(COST, instruction, OPAQUE_INSTRUCTION)
+instruction_cost(instruction::Symbol) = instruction_cost(Instruction(instruction))
 scalar_cost(instr::Instruction) = scalar_cost(instruction_cost(instr))
 vector_cost(instr::Instruction, Wshift, sizeof_T) = vector_cost(instruction_cost(instr), Wshift, sizeof_T)
 function cost(instruction::InstructionCost, Wshift, sizeof_T)
@@ -87,76 +89,78 @@ const OPAQUE_INSTRUCTION = InstructionCost(50, 50.0, -1.0, VectorizationBase.REG
 #    as a heuristic means of approximating register pressure, since many loads can be
 #    consolidated into a single register. The number of LICM-ed setindex!, on the other
 #    hand, should indicate how many registers we're keeping live for the sake of eventually storing.
-const COST = Dict{Symbol,InstructionCost}(
-    :getindex => InstructionCost(-3.0,0.5,3,1),
-    :setindex! => InstructionCost(-3.0,1.0,3,0),
-    :conditionalstore! => InstructionCost(-3.0,1.0,3,0),
-    :zero => InstructionCost(1,0.5),
-    :one => InstructionCost(3,0.5),
-    :(+) => InstructionCost(4,0.5),
-    :(-) => InstructionCost(4,0.5),
-    :(*) => InstructionCost(4,0.5),
-    :(/) => InstructionCost(13,4.0,-2.0),
-    :vadd => InstructionCost(4,0.5),
-    :vsub => InstructionCost(4,0.5),
-    :vmul => InstructionCost(4,0.5),
-    :vfdiv => InstructionCost(13,4.0,-2.0),
-    :evadd => InstructionCost(4,0.5),
-    :evsub => InstructionCost(4,0.5),
-    :evmul => InstructionCost(4,0.5),
-    :evfdiv => InstructionCost(13,4.0,-2.0),
-    :reduced_add => InstructionCost(4,0.5),# ignoring reduction part of cost, might be nop
-    :reduced_prod => InstructionCost(4,0.5),# ignoring reduction part of cost, might be nop
-    :reduce_to_add => InstructionCost(0,0.0,0.0,0),
-    :reduce_to_prod => InstructionCost(0,0.0,0.0,0),
-    :abs2 => InstructionCost(4,0.5),
-    :vabs2 => InstructionCost(4,0.5),
-    :(==) => InstructionCost(1, 0.5),
-    :isequal => InstructionCost(1, 0.5),
-    :(~) => InstructionCost(1, 0.5),
-    :(&) => InstructionCost(1, 0.5),
-    :(|) => InstructionCost(1, 0.5),
-    :(>) => InstructionCost(1, 0.5),
-    :(<) => InstructionCost(1, 0.5),
-    :(>=) => InstructionCost(1, 0.5),
-    :(<=) => InstructionCost(1, 0.5),
-    :ifelse => InstructionCost(1, 0.5),
-    :vifelse => InstructionCost(1, 0.5),
-    :inv => InstructionCost(13,4.0,-2.0,1),
-    :vinv => InstructionCost(13,4.0,-2.0,1),
-    :muladd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
-    :fma => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
-    :vmuladd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
-    :vfma => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
-    :vfmadd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
-    :vfmsub => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
-    :vfnmadd => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
-    :vfnmsub => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
-    :vfmadd_fast => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
-    :vfmsub_fast => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
-    :vfnmadd_fast => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
-    :vfnmsub_fast => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
-    :sqrt => InstructionCost(15,4.0,-2.0),
-    :sqrt_fast => InstructionCost(15,4.0,-2.0),
-    :log => InstructionCost(20,20.0,40.0,20),
-    :exp => InstructionCost(20,20.0,20.0,18),
-    :^ => InstructionCost(40,40.0,40.0,26), # FIXME
-    :sin => InstructionCost(18,15.0,68.0,23),
-    :cos => InstructionCost(18,15.0,68.0,26),
-    :sincos => InstructionCost(25,22.0,70.0,26),
-    :log_fast => InstructionCost(20,20.0,40.0,20),
-    :exp_fast => InstructionCost(20,20.0,20.0,18),
-    :sin_fast => InstructionCost(18,15.0,68.0,23),
-    :cos_fast => InstructionCost(18,15.0,68.0,26),
-    :sincos_fast => InstructionCost(25,22.0,70.0,26),
-    :identity => InstructionCost(0,0.0,0.0,0),
-    :adjoint => InstructionCost(0,0.0,0.0,0),
-    :transpose => InstructionCost(0,0.0,0.0,0),
+const COST = Dict{Instruction,InstructionCost}(
+    Instruction(:getindex) => InstructionCost(-3.0,0.5,3,1),
+    Instruction(:setindex!) => InstructionCost(-3.0,1.0,3,0),
+    Instruction(:conditionalstore!) => InstructionCost(-3.0,1.0,3,0),
+    Instruction(:zero) => InstructionCost(1,0.5),
+    Instruction(:one) => InstructionCost(3,0.5),
+    Instruction(:(+)) => InstructionCost(4,0.5),
+    Instruction(:(-)) => InstructionCost(4,0.5),
+    Instruction(:(*)) => InstructionCost(4,0.5),
+    Instruction(:(/)) => InstructionCost(13,4.0,-2.0),
+    Instruction(:vadd) => InstructionCost(4,0.5),
+    Instruction(:vsub) => InstructionCost(4,0.5),
+    Instruction(:vmul) => InstructionCost(4,0.5),
+    Instruction(:vfdiv) => InstructionCost(13,4.0,-2.0),
+    Instruction(:evadd) => InstructionCost(4,0.5),
+    Instruction(:evsub) => InstructionCost(4,0.5),
+    Instruction(:evmul) => InstructionCost(4,0.5),
+    Instruction(:evfdiv) => InstructionCost(13,4.0,-2.0),
+    Instruction(:reduced_add) => InstructionCost(4,0.5),# ignoring reduction part of cost, might be nop
+    Instruction(:reduced_prod) => InstructionCost(4,0.5),# ignoring reduction part of cost, might be nop
+    Instruction(:reduce_to_add) => InstructionCost(0,0.0,0.0,0),
+    Instruction(:reduce_to_prod) => InstructionCost(0,0.0,0.0,0),
+    Instruction(:abs2) => InstructionCost(4,0.5),
+    Instruction(:vabs2) => InstructionCost(4,0.5),
+    Instruction(:(==)) => InstructionCost(1, 0.5),
+    Instruction(:isequal) => InstructionCost(1, 0.5),
+    Instruction(:(~)) => InstructionCost(1, 0.5),
+    Instruction(:(&)) => InstructionCost(1, 0.5),
+    Instruction(:(|)) => InstructionCost(1, 0.5),
+    Instruction(:(>)) => InstructionCost(1, 0.5),
+    Instruction(:(<)) => InstructionCost(1, 0.5),
+    Instruction(:(>=)) => InstructionCost(1, 0.5),
+    Instruction(:(<=)) => InstructionCost(1, 0.5),
+    Instruction(:ifelse) => InstructionCost(1, 0.5),
+    Instruction(:vifelse) => InstructionCost(1, 0.5),
+    Instruction(:inv) => InstructionCost(13,4.0,-2.0,1),
+    Instruction(:vinv) => InstructionCost(13,4.0,-2.0,1),
+    Instruction(:muladd) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:fma) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vmuladd) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfma) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfmadd) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfmsub) => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfnmadd) => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfnmsub) => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfmadd_fast) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfmsub_fast) => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfnmadd_fast) => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfnmsub_fast) => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:sqrt) => InstructionCost(15,4.0,-2.0),
+    Instruction(:sqrt_fast) => InstructionCost(15,4.0,-2.0),
+    Instruction(:log) => InstructionCost(20,20.0,40.0,20),
+    Instruction(:exp) => InstructionCost(20,20.0,20.0,18),
+    Instruction(:(^)) => InstructionCost(40,40.0,40.0,26), # FIXME
+    Instruction(:sin) => InstructionCost(18,15.0,68.0,23),
+    Instruction(:cos) => InstructionCost(18,15.0,68.0,26),
+    Instruction(:sincos) => InstructionCost(25,22.0,70.0,26),
+    Instruction(:log_fast) => InstructionCost(20,20.0,40.0,20),
+    Instruction(:exp_fast) => InstructionCost(20,20.0,20.0,18),
+    Instruction(:sin_fast) => InstructionCost(18,15.0,68.0,23),
+    Instruction(:cos_fast) => InstructionCost(18,15.0,68.0,26),
+    Instruction(:sincos_fast) => InstructionCost(25,22.0,70.0,26),
+    Instruction(:identity) => InstructionCost(0,0.0,0.0,0),
+    Instruction(:adjoint) => InstructionCost(0,0.0,0.0,0),
+    Instruction(:transpose) => InstructionCost(0,0.0,0.0,0),
     # Symbol("##CONSTANT##") => InstructionCost(0,0.0)
 )
 
-const KNOWNINSTRUCTIONS = keys(COST)
-instruction(f, m) = f ∈ KNOWNINSTRUCTIONS ? Instruction(:LoopVectorization, f) : Instruction(m, f)
+# const KNOWNINSTRUCTIONS = keys(COST)
+# instruction(f, m) = f ∈ KNOWNINSTRUCTIONS ? Instruction(:LoopVectorization, f) : Instruction(m, f)
+instruction(f::Symbol, m) = Instruction(f) ∈ keys(COST) ? Instruction(f) : Instruction(m, f)
+# instruction(f, m) = get(COST, f, Instruction(m, f))
 
 # for (k, v) ∈ COST # so we can look up Symbol(typeof(function))
 #     COST[Symbol("typeof(", lower(k), ")")] = v
diff --git a/src/lower_constant.jl b/src/lower_constant.jl
@@ -1,14 +1,30 @@
-
+function lower_zero!(
+    q::Expr, op::Operation, vectorized::Symbol, W::Symbol, unrolled::Symbol, U::Int,
+    suffix::Union{Nothing,Int}, typeT::Symbol
+)
+    mvar = variable_name(op, suffix)
+    if vectorized ∈ loopdependencies(op) || vectorized ∈ reducedchildren(op) || vectorized ∈ reduceddependencies(op)
+        call = Expr(:call, lv(:vzero), W, typeT)
+    else
+        call = Expr(:call, :zero, typeT)
+    end
+    if unrolled ∈ loopdependencies(op) || unrolled ∈ reducedchildren(op) || unrolled ∈ reduceddependencies(op)
+        for u ∈ 0:U-1
+            push!(q.args, Expr(:(=), Symbol(mvar, u), call))
+        end
+    else
+        push!(q.args, Expr(:(=), mvar, call))
+    end
+    nothing    
+end
 function lower_constant!(
     q::Expr, op::Operation, vectorized::Symbol, W::Symbol, unrolled::Symbol, U::Int,
-    suffix::Union{Nothing,Int}, mask::Any = nothing
+    suffix::Union{Nothing,Int}
 )
     instruction = op.instruction
     mvar = variable_name(op, suffix)
     constsym = instruction.instr
-    # constsym = mangledvar(op)
     if vectorized ∈ loopdependencies(op) || vectorized ∈ reducedchildren(op) || vectorized ∈ reduceddependencies(op)
-        # call = Expr(:call, lv(:vbroadcast), W, mangledvar(op))
         call = Expr(:call, lv(:vbroadcast), W, constsym)
         if unrolled ∈ loopdependencies(op) || unrolled ∈ reducedchildren(op) || unrolled ∈ reduceddependencies(op)
             for u ∈ 0:U-1
@@ -29,6 +45,21 @@ function lower_constant!(
     nothing
 end
 
+function setop!(ls, op, val)
+    if instruction(op) === LOOPCONSTANT# && mangledvar(op) !== val
+        pushpreamble!(ls, Expr(:(=), mangledvar(op), val))
+    else
+        pushpreamble!(ls, Expr(:(=), instruction(op).instr, val))
+    end
+    nothing
+end
+function setconstantop!(ls, op, val)
+    if instruction(op) === LOOPCONSTANT# && mangledvar(op) !== val
+        pushpreamble!(ls, Expr(:(=), mangledvar(op), val))
+    end
+    nothing
+end
+
 
 function lower_licm_constants!(ls::LoopSet)
     ops = operations(ls)
@@ -42,7 +73,7 @@ function lower_licm_constants!(ls::LoopSet)
         setop!(ls, ops[id], Expr(:call, lv(:sizeequivalentfloat), ls.T, intval))
     end
     for id ∈ ls.preamble_zeros
-        setop!(ls, ops[id], Expr(:call, :zero, ls.T))
+        setconstantop!(ls, ops[id], Expr(:call, :zero, ls.T))
     end
     for id ∈ ls.preamble_ones
         setop!(ls, ops[id], Expr(:call, :one, ls.T))
diff --git a/src/lowering.jl b/src/lowering.jl
@@ -8,11 +8,16 @@
 # end
 
 function lower!(
-    q::Expr, op::Operation, vectorized::Symbol, W::Symbol, unrolled::Symbol, tiled::Symbol, U::Int,
-    suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned} = nothing
+    q::Expr, op::Operation, vectorized::Symbol, ls::LoopSet, unrolled::Symbol, tiled::Symbol, U::Int,
+    suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned}
 )
+    W = ls.W
     if isconstant(op)
-        lower_constant!(q, op, vectorized, W, unrolled, U, suffix, mask)
+        if identifier(op) ∈ ls.preamble_zeros
+            lower_zero!(q, op, vectorized, W, unrolled, U, suffix, ls.T)
+        else
+            lower_constant!(q, op, vectorized, W, unrolled, U, suffix)
+        end
     elseif isload(op)
         lower_load!(q, op, vectorized, W, unrolled, tiled, U, suffix, mask)
     elseif iscompute(op)
@@ -22,10 +27,10 @@ function lower!(
     end
 end
 function lower!(
-    q::Expr, ops::AbstractVector{Operation}, vectorized::Symbol, W::Symbol, unrolled::Symbol, tiled::Symbol, U::Int,
-    suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned} = nothing
+    q::Expr, ops::AbstractVector{Operation}, vectorized::Symbol, ls::LoopSet, unrolled::Symbol, tiled::Symbol, U::Int,
+    suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned}
 )
-    foreach(op -> lower!(q, op, vectorized, W, unrolled, tiled, U, suffix, mask), ops)
+    foreach(op -> lower!(q, op, vectorized, ls, unrolled, tiled, U, suffix, mask), ops)
 end
 
 tiledsym(s::Symbol) = Symbol("##outer##", s, "##outer##")
@@ -70,9 +75,9 @@ function lower_nest(
     end
     for prepost ∈ 1:2
         # !U && !T
-        lower!(blockq, ops[1,1,prepost,n], vectorized, W, unrolled, last(order), U, nothing, mask)
+        lower!(blockq, ops[1,1,prepost,n], vectorized, ls, unrolled, last(order), U, nothing, mask)
         # for u ∈ 0:U-1     #  U && !T
-        lower!(blockq, ops[2,1,prepost,n], vectorized, W, unrolled, last(order), U, nothing, mask)
+        lower!(blockq, ops[2,1,prepost,n], vectorized, ls, unrolled, last(order), U, nothing, mask)
         # end
         if length(ops[1,2,prepost,n]) + length(ops[2,2,prepost,n]) > 0
             for t ∈ 0:T-1
@@ -82,9 +87,9 @@ function lower_nest(
                     push!(blockq.args, Expr(:+=, last(order), 1))
                 end
                 # !U &&  T
-                lower!(blockq, ops[1,2,prepost,n], vectorized, W, unrolled, last(order), U, t, mask)
+                lower!(blockq, ops[1,2,prepost,n], vectorized, ls, unrolled, last(order), U, t, mask)
                 # for u ∈ 0:U-1 #  U &&  T
-                lower!(blockq, ops[2,2,prepost,n], vectorized, W, unrolled, last(order), U, t, mask)
+                lower!(blockq, ops[2,2,prepost,n], vectorized, ls, unrolled, last(order), U, t, mask)
                 # end
             end
         end
@@ -146,9 +151,16 @@ end
 function initialize_outer_reductions!(
     q::Expr, op::Operation, Umin::Int, Umax::Int, W::Symbol, typeT::Symbol, vectorized::Symbol, suffix::Union{Symbol,Nothing} = nothing
 )
-    z = Expr(:call, reduction_zero(op.instruction), typeT)
-    if vectorized ∈ reduceddependencies(op)
-        z = Expr(:call, lv(:vbroadcast), W, z)
+    reduct_zero = reduction_zero(op.instruction)
+    isvectorized = vectorized ∈ reduceddependencies(op)
+    z = if isvectorized
+        if reduct_zero === :zero
+            Expr(:call, lv(:vzero), W, typeT)
+        else
+            Expr(:call, lv(:vbroadcast), W, Expr(:call, reduct_zero, typeT))
+        end
+    else
+        Expr(:call, reduct_zero, typeT)
     end
     mvar = variable_name(op, suffix)
     for u ∈ Umin:Umax-1
@@ -362,20 +374,6 @@ end
 @inline sizeequivalentint(::Type{Float16}, x::Int64) = Int16(x)
 @inline sizeequivalentint(::Type{Float16}, x::Int32) = Int16(x)
 
-function setop!(ls, op, val)
-    if instruction(op) === LOOPCONSTANT# && mangledvar(op) !== val
-        pushpreamble!(ls, Expr(:(=), mangledvar(op), val))
-    else
-        pushpreamble!(ls, Expr(:(=), instruction(op).instr, val))
-    end
-    nothing
-end
-function setconstantop!(ls, op, val)
-    if instruction(op) === LOOPCONSTANT# && mangledvar(op) !== val
-        pushpreamble!(ls, Expr(:(=), mangledvar(op), val))
-    end
-    nothing
-end
 
 function setup_preamble!(ls::LoopSet, W::Symbol, typeT::Symbol, vectorized::Symbol, unrolled::Symbol, tiled::Symbol, U::Int)
     # println("Setup preamble")
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,7 +1,7 @@
 using Test
 using LoopVectorization
 using LinearAlgebra
-T = Float32
+# T = Float32
 
 
 function clenshaw(x,coeff)
@@ -1188,6 +1188,11 @@ end
             ret[j] = clenshaw(x[j], coeff)
         end
     end
+    function clenshawavx!(ret,x,coeff)
+        @avx for j in 1:length(ret)
+            ret[j] = clenshaw(x[j], coeff)
+        end
+    end
 
     function softmax3_core!(lse, qq, xx, tmpmax, maxk, nk)
         for k in Base.OneTo(maxk)
@@ -1443,6 +1448,8 @@ end
         clenshaw!(y1,x,c)
         clenshaw_avx!(y2,x,c)
         @test y1 ≈ y2
+        clenshawavx!(y2,x,c)
+        @test y1 ≈ y2
 
 
         ni, nj, nk = (100, 100, 10)