Minor progress.

chriselrod · chriselrod · commit bc6a8879beeb · 2019-11-27T15:39:14.000-05:00
diff --git a/src/costs.jl b/src/costs.jl
@@ -17,14 +17,14 @@ InstructionCost(sl, srt, scaling = -3.0) = InstructionCost(sl, srt, scaling, 1)
 function scalar_cost(instruction::InstructionCost)#, ::Type{T} = Float64) where {T}
     instruction.scalar_latency, instruction.scalar_reciprical_throughput
 end
-function vector_cost(instruction::InstructionCost, Wshift, ::Type{T} = Float64) where {T}
+function vector_cost(instruction::InstructionCost, Wshift, sizeof_T)
     sl, srt = scalar_cost(instruction)
     scaling = instruction.scaling
     if scaling == -3.0 || Wshift == 0
         return sl, srt
     elseif scaling == -2.0
-        srt *= 1 << (Wshift + VectorizationBase.intlog2(sizeof(T)) - 4)
-        if (sizeof(T) << Wshift) == VectorizationBase.REGISTER_SIZE # These instructions experience double latency with zmm
+        srt *= 1 << (Wshift + VectorizationBase.intlog2(sizeof_T) - 4)
+        if (sizeof_T << Wshift) == 64 # VectorizationBase.REGISTER_SIZE # These instructions experience double latency with zmm
             sl += sl
         end
     elseif scaling == -1.0
@@ -37,28 +37,35 @@ function vector_cost(instruction::InstructionCost, Wshift, ::Type{T} = Float64)
     end    
     sl, srt
 end
-function cost(instruction::InstructionCost, Wshift, ::Type{T}) where {T}
-    Wshift == 0 ? scalar_cost(instruction) : vector_cost(instruction, Wshift, T)
+function cost(instruction::InstructionCost, Wshift, sizeof_T)
+    Wshift == 0 ? scalar_cost(instruction) : vector_cost(instruction, Wshift, sizeof_T)
+end
+
+function cost(instruction::Symbol, Wshift, sizeof_T)
+    cost(
+        get(COST, instruction, OPAQUE_INSTRUCTION),
+        Wshift, sizeof_T
+    )
 end
 
 # Just a semi-reasonable assumption; should not be that sensitive to anything other than loads
-const OPAQUE_INSTRUCTION = InstructionSet(50.0, 50.0, -1.0, VectorizationBase.REGISTER_COUNT)
+const OPAQUE_INSTRUCTION = InstructionCost(50, 50.0, -1.0, VectorizationBase.REGISTER_COUNT)
 
 const COST = Dict{Symbol,InstructionCost}(
     :getindex => InstructionCost(3,0.5),
     :setindex! => InstructionCost(3,1.0), # but not a part of dependency chains, so not really twice as expensive?
-    :+ => InstructionCost(4,0.5),
-    :- => InstructionCost(4,0.5),
-    :* => InstructionCost(4,0.5),
-    :/ => InstructionCost(13,4.0,-2.0),
-    :== => InstructionCost(1, 0.5),
+    :(+) => InstructionCost(4,0.5),
+    :(-) => InstructionCost(4,0.5),
+    :(*) => InstructionCost(4,0.5),
+    :(/) => InstructionCost(13,4.0,-2.0),
+    :(==) => InstructionCost(1, 0.5),
     :isequal => InstructionCost(1, 0.5),
-    :& => InstructionCost(1, 0.5),
-    :| => InstructionCost(1, 0.5),
-    :> => InstructionCost(1, 0.5),
-    :< => InstructionCost(1, 0.5),
-    :>= => InstructionCost(1, 0.5),
-    :<= => InstructionCost(1, 0.5),
+    :(&) => InstructionCost(1, 0.5),
+    :(|) => InstructionCost(1, 0.5),
+    :(>) => InstructionCost(1, 0.5),
+    :(<) => InstructionCost(1, 0.5),
+    :(>=) => InstructionCost(1, 0.5),
+    :(<=) => InstructionCost(1, 0.5),
     :inv => InstructionCost(13,4.0,-2.0,2),
     :muladd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
     :fma => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
@@ -76,26 +83,6 @@ const COST = Dict{Symbol,InstructionCost}(
     :sincos => InstructionCost(25,22.0,70.0,27)
 )
 
-function sum_simd(x)
-    s = zero(eltype(x))
-    @simd for xᵢ ∈ x
-        s += xᵢ
-    end
-    s
-end
-using LoopVectorization, BenchmarkTools
-function sum_loopvec(x::AbstractVector{Float64})
-    s = 0.0
-    @vvectorize 4 for i ∈ eachindex(x)
-        s += x[i]
-    end
-    s
-end
-x = rand(111);
-@btime sum($x)
-@btime sum_simd($x)
-@btime sum_loopvec($x)
-
 
 # const SIMDPIRATES_COST = Dict{Symbol,InstructionCost}()
 # const SLEEFPIRATES_COST = Dict{Symbol,InstructionCost}()
diff --git a/src/graphs.jl b/src/graphs.jl
@@ -3,10 +3,32 @@
 
 isdense(::Type{<:DenseArray}) = true
 
+"""
+`ShortVector{T}` simply wraps a `Vector{T}`, but uses a different hash function that is faster for short vectors, so that it can be used efficiently as the key of a `Dict`.
+This hash function scales as O(N) with the length of the vector, so it is slow for long vectors.
+"""
+struct ShortVector{T} <: DenseVector{T}
+    data::Vector{T}
+end
+Base.@propagate_inbounds Base.getindex(x::ShortVector, I...) = x.data[I...]
+Base.@propagate_inbounds Base.setindex!(x::ShortVector, v, I...) = x.data[I...] = v
+@inbounds Base.length(x::ShortVector) = length(x.data)
+@inbounds Base.size(x::ShortVector) = size(x.data)
+@inbounds Base.strides(x::ShortVector) = strides(x.data)
+@inbounds Base.push!(x::ShortVector, v) = push!(x.data, v)
+@inbounds Base.append!(x::ShortVector, v) = append!(x.data, v)
+function Base.hash(x::ShortVector, h::UInt)
+    @inbounds for n ∈ eachindex(x)
+        h = hash(x[n], h)
+    end
+    h
+end
+
+
+
 @enum NodeType begin
     memload
     memstore
-    reduction
     compute
 end
 
@@ -15,61 +37,62 @@ struct Operation
     elementbytes::Int
     instruction::Symbol
     node_type::NodeType
+    # dependencies::ShortVector{Symbol}
+    dependencies::Set{Symbol}
+    # dependencies::Set{Symbol}
     parents::Vector{Operation}
     children::Vector{Operation}
-    metadata::Vector{Float64}
+    numerical_metadata::Vector{Float64}
+    symbolic_metadata::Vector{Symbol}
     function Operation(elementbytes, instruction, node_type)
         new(
             elementbytes, instruction, node_type,
-            Operation[], Operation[], Float64[]
+            Set{Symbol}(), Operation[], Operation[], Float64[], Symbol[]
         )
     end
 end
 
-isreduction(op::Operation) = op.node_type == reduction
+function isreduction(op::Operation)
+    (op.node_type == memstore) && (length(op.symbolic_metadata) < length(op.dependencies)) && issubset(op.symbolic_metadata, op.dependencies)
+end
 isload(op::Operation) = op.node_type == memload
 isstore(op::Operation) = op.node_type == memstore
 accesses_memory(op::Operation) = isload(op) | isstore(op)
-Base.eltype(var::Operation) = op.outtype
-
-"""
-ShortVector{T} simply wraps a Vector{T}, but uses a different hash function that is faster for short vectors to support using it as the keys of a Dict.
-This hash function scales O(N) with length of the vectors, so it is slow for long vectors.
-"""
-struct ShortVector{T} <: DenseVector{T}
-    data::Vector{T}
-end
-Base.@propagate_inbounds Base.getindex(x::ShortVector, I...) = x.data[I...]
-Base.@propagate_inbounds Base.setindex!(x::ShortVector, v, I...) = x.data[I...] = v
-@inbounds Base.length(x::ShortVector) = length(x.data)
-@inbounds Base.size(x::ShortVector) = size(x.data)
-@inbounds Base.strides(x::ShortVector) = strides(x.data)
-@inbounds Base.push!(x::ShortVector, v) = push!(x.data, v)
-@inbounds Base.append!(x::ShortVector, v) = append!(x.data, v)
-function Base.hash(x::ShortVector, h::UInt)
-    @inbounds for n ∈ eachindex(x)
-        h = hash(x[n], h)
-    end
-    h
-end
+elsize(op::Operation) = op.elementbytes
+dependson(op::Operation, sym::Symbol) = sym ∈ op.dependencies
 
 function stride(op::Operation, sym::Symbol)
     @assert accesses_memory(op) "This operation does not access memory!"
     # access stride info?
 end
-function
+# function
 
 struct Node
     type::DataType
 end
 
+struct Loop
+    itersymbol::Symbol
+    rangehint::Int
+    rangesym::Symbol
+    hintexact::Bool # if true, rangesym ignored and rangehint used for final lowering
+end
+function Loop(itersymbol::Symbol, rangehint::Int)
+    Loop( itersymbol, rangehint, :undef, true )
+end
+function Loop(itersymbol::Symbol, rangesym::Symbol, rangehint::Int = 1_000_000)
+    Loop( itersymbol, rangehint, rangesym, false )
+end
+
 # Must make it easy to iterate
 struct LoopSet
+    loops::Dict{Symbol,Loop} # sym === loops[sym].itersymbol
+    operations::Vector{Operation}
     
 end
 
 function Base.length(ls::LoopSet, is::Symbol)
-
+    ls.loops[is].rangehint
 end
 function variables(ls::LoopSet)
 
@@ -78,7 +101,7 @@ function loopdependencies(var::Operation)
 
 end
 function sym(var::Operation)
-
+    
 end
 function instruction(var::Operation)
 
@@ -89,6 +112,7 @@ end
 function stride(var::Operation, sym::Symbol)
 
 end
+operations(ls::LoopSet) = ls.operations
 function cost(var::Operation, unrolled::Symbol, dim::Int)
     c = cost(instruction(var), Wshift, T)::Int
     if accesses_memory(var)
@@ -108,31 +132,31 @@ end
     # Base._return_type()
 
 function biggest_type(ls::LoopSet)
-
+    maximum(elsize, ls.operations)
 end
 
 
 
 # evaluates cost of evaluating loop in given order
 function evaluate_cost_unroll(
-    ls::LoopSet, order::ShortVector{Symbol}, unrolled::Symbol, max_cost = typemax(Int)
+    ls::LoopSet, order::ShortVector{Symbol}, unrolled::Symbol, max_cost = typemax(Float64)
 )
     included_vars = Set{Symbol}()
     nested_loop_syms = Set{Symbol}()
     total_cost = 0.0
     iter = 1.0
     # Need to check if fusion is possible
-    # W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, unrolled), biggest_type(ls))::Tuple{Int,Int}
+    W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, unrolled), biggest_type(ls))::Tuple{Int,Int}
     for itersym ∈ order
         # Add to set of defined symbles
         push!(nested_loop_syms, itersym)
-        liter = length(ls, itersym)
+        liter = Float64(length(ls, itersym))
         if itersym == unrolled
             liter /= W
         end
         iter *= liter
         # check which vars we can define at this level of loop nest
-        for var ∈ variables(ls)
+        for var ∈ operations(ls)
             # won't define if already defined...
             sym(var) ∈ included_vars && continue
             # it must also be a subset of defined symbols
@@ -141,14 +165,48 @@ function evaluate_cost_unroll(
             push!(included_vars, sym(var))
             
             total_cost += iter * cost(var, W, Wshift, unrolled, liter)
-            total_cost > max_cost && return total_cost # abort
+            total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
         end
     end
+    total_cost
 end
-function evaluate_cost_tile(
-    ls::LoopSet, order::ShortVector{Symbol}, tiler, tilec, max_cost = typemax(Int)
+
+# only covers unrolled ops; everything else considered lifted?
+function depchain_cost!(
+    skip::Set{Symbol}, ls::LoopSet, op::Operation, unrolled::Symbol, Wshift::Int, size_T::Int
+)
+    
+end
+   
+function determine_unroll_factor(
+    ls::LoopSet, order::ShortVector{Symbol}, unrolled::Symbol, Wshift::Int, size_T::Int
 )
+    # The strategy is to use an unroll factor of 1, unless there appear to be loop-carried dependencies (i.e., num_reductions > 0).
+    # The assumption here is that unrolling provides no real benefit unless it is needed to enable out-of-order (OOO) execution by breaking up these dependency chains.
+    num_reductions = sum(isreduction, operations(ls))
+    iszero(num_reductions) && return 1
+    # So if num_reductions > 0, we set the unroll factor high enough that the CPU can be kept busy.
+    # In that case, U = max(1, round(Int, max(latency) * throughput / num_reductions)) = max(1, round(Int, latency / (recip_throughput * num_reductions)))
+    latency = 0
+    recip_throughput = 0.0
+    visited_nodes = Set{Symbol}()
+    for op ∈ operations(ls)
+        if isreduction(op) && dependson(op, unrolled)
+            l, rt = cost_of_chain()
+            num_reductions += 1
+            sl, rt = cost(instruction(op), Wshift, size_T)
+            latency = max(sl, latency)
+            recip_throughput += rt
+        end
+    end
+    
 
+    
+end
+function evaluate_cost_tile(
+    ls::LoopSet, order::ShortVector{Symbol}, tiler, tilec, max_cost = typemax(Float64)
+)
+    
 end
 
 struct LoopOrders
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,6 +1,25 @@
 using LoopVectorization
 using Test
 
+pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
+using VectorizationBase, SIMDPirates, SLEEFPirates
+# includet(joinpath(pkgdir("LoopVectorization"), "src/costs.jl"))
+# includet(joinpath(pkgdir("LoopVectorization"), "src/graphs.jl"))
+include(joinpath(pkgdir("LoopVectorization"), "src/costs.jl"))
+include(joinpath(pkgdir("LoopVectorization"), "src/graphs.jl"))
+
+# loop is gemv!
+for c ∈ 1:C
+    for r ∈ 1:R
+        y[r] += A[r,c] * x[c]
+        # translates to
+        # y[r] = vmuladd(A[r,c], x[c], y[r])
+    end
+end
+
+         
+
+
 using CpuId, VectorizationBase, SIMDPirates, SLEEFPirates, VectorizedRNG
 
 @generated function estimate_cost_onearg_serial(f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)) where {F,T,U}