
Commit 2c99f6a

Merge branch 'graphs' of https://github.com/chriselrod/LoopVectorization.jl into graphs
2 parents 408d0ac + db9a348

File tree

4 files changed: +331 −27 lines

README.md

Lines changed: 40 additions & 0 deletions
@@ -14,3 +14,43 @@ Pkg.add(PackageSpec(url="https://github.com/chriselrod/SIMDPirates.jl"))
 Pkg.add(PackageSpec(url="https://github.com/chriselrod/SLEEFPirates.jl"))
 Pkg.add(PackageSpec(url="https://github.com/chriselrod/LoopVectorization.jl"))
 ```
+
+
+## Usage
+
+The current version of LoopVectorization provides a simple, dumb transform of a single loop.
+What I mean by this is that it does not check the transformations for validity. To be safe, I would stick to straight loops that transform arrays or calculate reductions.
+
+For example,
+```julia
+function sum_simd(x)
+    s = zero(eltype(x))
+    @simd for xᵢ ∈ x
+        s += xᵢ
+    end
+    s
+end
+using LoopVectorization, BenchmarkTools
+function sum_loopvec(x::AbstractVector{Float64})
+    s = 0.0
+    @vvectorize 4 for i ∈ eachindex(x)
+        s += x[i]
+    end
+    s
+end
+x = rand(110);
+@btime sum($x)
+# 20.527 ns (0 allocations: 0 bytes)
+# 53.38001667116997
+
+@btime sum_simd($x)
+# 16.749 ns (0 allocations: 0 bytes)
+# 53.38001667116997
+
+@btime sum_loopvec($x)
+# 12.022 ns (0 allocations: 0 bytes)
+# 53.38001667116997
+```
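Since the macro does not check the transform for validity, loops with loop-carried dependencies are the main case to avoid. A minimal, hypothetical illustration (not part of the diff) of a loop that would be unsafe to vectorize blindly:

```julia
# Hypothetical example: iteration i reads the value written by iteration
# i - 1, so reordering or vectorizing the body changes the result.
function running_sum!(x)
    for i ∈ 2:length(x)
        x[i] += x[i-1] # loop-carried dependency
    end
    x
end
```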

src/costs.jl

Lines changed: 54 additions & 16 deletions
@@ -9,49 +9,87 @@
 struct InstructionCost
     scalar_latency::Int
     scalar_reciprical_throughput::Float64
-    scaling::Float64 # sentinel values: -2 == no scaling; -1 == scaling, >0 -> == latency == reciprocal throughput
-
+    scaling::Float64 # sentinel values: -3 == no scaling; -2 == offset_scaling, -1 == linear scaling, >0 -> == latency == reciprocal throughput
+    register_pressure::Int
 end
-InstructionCost(sl, srt) = InstructionCost(sl, srt, NoCost)
+InstructionCost(sl, srt, scaling = -3.0) = InstructionCost(sl, srt, scaling, 0)

 function scalar_cost(instruction::InstructionCost)#, ::Type{T} = Float64) where {T}
     instruction.scalar_latency, instruction.scalar_reciprical_throughput
 end
 function vector_cost(instruction::InstructionCost, Wshift, ::Type{T} = Float64) where {T}
     sl, srt = scalar_cost(instruction)
     scaling = instruction.scaling
-    if scaling == NoCost || Wshift == 0
-        return sl, srt
-    elseif scaling == Linear
+    if scaling == -3.0 || Wshift == 0
+        return sl, srt
+    elseif scaling == -2.0
         srt *= 1 << (Wshift + VectorizationBase.intlog2(sizeof(T)) - 4)
         if (sizeof(T) << Wshift) == VectorizationBase.REGISTER_SIZE # These instructions experience double latency with zmm
             sl += sl
         end
-    end
-
+    elseif scaling == -1.0
+        W = 1 << Wshift
+        extra_latency = sl - srt
+        srt *= W
+        sl = srt + extra_latency
+    else
+        sl, srt = scaling, scaling
+    end
     sl, srt
 end
 function cost(instruction::InstructionCost, Wshift, ::Type{T}) where {T}
     Wshift == 0 ? scalar_cost(instruction) : vector_cost(instruction, Wshift, T)
 end

+# Just a semi-reasonable assumption; should not be that sensitive to anything other than loads
+const OPAQUE_INSTRUCTION = InstructionCost(50, 50.0, -1.0, 32)
+
 const COST = Dict{Symbol,InstructionCost}(
     :getindex => InstructionCost(3,0.5),
     :setindex! => InstructionCost(3,1.0), # but not a part of dependency chains, so not really twice as expensive?
     :+ => InstructionCost(4,0.5),
     :- => InstructionCost(4,0.5),
     :* => InstructionCost(4,0.5),
-    :/ => InstructionCost(13,4.0,),
+    :/ => InstructionCost(13,4.0,-2.0),
+    :== => InstructionCost(1, 0.5),
+    :isequal => InstructionCost(1, 0.5),
+    :& => InstructionCost(1, 0.5),
+    :| => InstructionCost(1, 0.5),
+    :> => InstructionCost(1, 0.5),
+    :< => InstructionCost(1, 0.5),
+    :>= => InstructionCost(1, 0.5),
+    :<= => InstructionCost(1, 0.5),
+    :inv => InstructionCost(13,4.0,-2.0,1),
     :muladd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
-    :sqrt => InstructionCost(),
-    :log => InstructionCost(,,52.5),
-    :exp => InstructionCost(,,30.0),
-    :sin => InstructionCost(),
-    :cos => InstructionCost(),
-    :sincos => InstructionCost(),
-    :
+    :sqrt => InstructionCost(15,4.0,-2.0),
+    :log => InstructionCost(20,20.0,40.0,20),
+    :exp => InstructionCost(20,20.0,20.0,18),
+    :sin => InstructionCost(18,15.0,68.0,23),
+    :cos => InstructionCost(18,15.0,68.0,26),
+    :sincos => InstructionCost(25,22.0,70.0,26)
 )

+function sum_simd(x)
+    s = zero(eltype(x))
+    @simd for xᵢ ∈ x
+        s += xᵢ
+    end
+    s
+end
+using LoopVectorization, BenchmarkTools
+function sum_loopvec(x::AbstractVector{Float64})
+    s = 0.0
+    @vvectorize 4 for i ∈ eachindex(x)
+        s += x[i]
+    end
+    s
+end
+x = rand(111);
+@btime sum($x)
+@btime sum_simd($x)
+@btime sum_loopvec($x)
+
+
 # const SIMDPIRATES_COST = Dict{Symbol,InstructionCost}()
 # const SLEEFPIRATES_COST = Dict{Symbol,InstructionCost}()

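As an aside, the sentinel semantics above can be made concrete with a standalone sketch of `vector_cost`'s scaling rules, substituting plain assumptions (`sizeof(T) == 8`, 64-byte registers, `round(Int, log2(...))` in place of `VectorizationBase.intlog2`); this mirrors the diff, not the package's final API:

```julia
# Sketch of vector_cost's scaling rules under assumed Float64 elements and
# 64-byte (zmm) registers. scaling == -3.0: no scaling; -2.0: offset scaling;
# -1.0: linear scaling; a positive value is both latency and recip. throughput.
function vector_cost_sketch(sl, srt, scaling, Wshift; sizeofT = 8, register_size = 64)
    if scaling == -3.0 || Wshift == 0
        return sl, srt
    elseif scaling == -2.0
        srt *= 1 << (Wshift + round(Int, log2(sizeofT)) - 4)
        if (sizeofT << Wshift) == register_size
            sl += sl # these instructions experience double latency at full width
        end
    elseif scaling == -1.0
        W = 1 << Wshift
        extra_latency = sl - srt
        srt *= W
        sl = srt + extra_latency
    else
        sl, srt = scaling, scaling
    end
    sl, srt
end

# Division (:/) has scaling -2.0; at W = 8 (Wshift = 3) its latency doubles
# and its reciprocal throughput is multiplied by 4:
vector_cost_sketch(13, 4.0, -2.0, 3) # (26, 16.0)
```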
src/graphs.jl

Lines changed: 136 additions & 9 deletions
@@ -1,17 +1,27 @@
 # using LightGraphs

-struct ShortIntVector
-    data::Vector{Int}
+
+"""
+ShortVector{T} simply wraps a Vector{T}, but uses a different hash function that is faster for short vectors, to support using it as the keys of a Dict.
+This hash function scales O(N) with the length of the vector, so it is slow for long vectors.
+"""
+struct ShortVector{T} <: DenseVector{T}
+    data::Vector{T}
 end
-function Base.hash(x::ShortIntVector, h::UInt)
-    d = x.data
-    @inbounds for n ∈ eachindex(d)
-        h = hash(d[n], h)
+Base.@propagate_inbounds Base.getindex(x::ShortVector, I...) = x.data[I...]
+Base.@propagate_inbounds Base.setindex!(x::ShortVector, v, I...) = x.data[I...] = v
+@inbounds Base.length(x::ShortVector) = length(x.data)
+@inbounds Base.size(x::ShortVector) = size(x.data)
+@inbounds Base.strides(x::ShortVector) = strides(x.data)
+@inbounds Base.push!(x::ShortVector, v) = push!(x.data, v)
+@inbounds Base.append!(x::ShortVector, v) = append!(x.data, v)
+function Base.hash(x::ShortVector, h::UInt)
+    @inbounds for n ∈ eachindex(x)
+        h = hash(x[n], h)
     end
     h
 end
-
 @enum NodeType begin
     input
     store
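The custom `hash` lets these short vectors serve as `Dict` keys cheaply; since `ShortVector <: DenseVector`, equality falls back to `AbstractArray`'s elementwise comparison. A hypothetical usage sketch (names invented here), assuming the definitions above are loaded:

```julia
# Hypothetical: loop orders as Dict keys. Two ShortVectors with equal
# contents hash identically, so lookups behave as expected.
order_costs = Dict{ShortVector{Symbol},Float64}()
order_costs[ShortVector([:i, :j, :k])] = 12.5
order_costs[ShortVector([:k, :j, :i])] = 9.0
order_costs[ShortVector([:i, :j, :k])] # 12.5
```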
@@ -27,11 +37,128 @@ struct LoopSet

 end

+function Base.length(ls::LoopSet, is::Symbol)
+
+end
+function variables(ls::LoopSet)
+
+end
+function loopdependencies(var::Variable)
+
+end
+function sym(var::Variable)
+
+end
+function instruction(var::Variable)
+
+end
+function accesses_memory(var::Variable)
+
+end
+function stride(var::Variable, sym::Symbol)
+
+end
+function cost(var::Variable, unrolled::Symbol, dim::Int)
+    c = cost(instruction(var), Wshift, T)::Int
+    if accesses_memory(var) && stride(var, unrolled) != 1
+        c *= W
+    end
+    c
+end
+function Base.eltype(var::Variable)
+    Base._return_type()
+end
+function biggest_type(ls::LoopSet)
+
+end
+
 # evaluates cost of evaluating loop in given order
 function evaluate_cost(
-    ls::LoopSet, order::ShortIntVector
+    ls::LoopSet, order::ShortVector{Symbol}, max_cost = typemax(Int)
 )
-
+    included_vars = Set{Symbol}()
+    nested_loop_syms = Set{Symbol}()
+    total_cost = 0.0
+    iter = 1.0
+    unrolled = last(order)
+    W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, unrolled), biggest_type(ls))::Tuple{Int,Int}
+
+    fused_with_previous = fill(false, length(order))
+    for itersym ∈ order
+        # Add to set of defined symbols
+        push!(nested_loop_syms, itersym)
+        liter = length(ls, itersym)
+        if itersym == unrolled
+            liter /= W
+        end
+        iter *= liter
+        # check which vars we can define at this level of loop nest
+        added_vars = 0
+        for (var,instruction) ∈ variables(ls)
+            # won't define if already defined...
+            sym(var) ∈ included_vars && continue
+            # it must also be a subset of defined symbols
+            loopdependencies(var) ⊆ nested_loop_syms || continue
+            added_vars += 1
+            push!(included_vars, sym(var))
+
+            total_cost += iter * cost(var, W, Wshift, unrolled, liter)
+            total_cost > max_cost && return total_cost # abort
+        end
+        if added_vars == 0
+            # Then it is worth checking if we can fuse with previous
+        end
+    end
+end
+
+struct LoopOrders
+    syms::Vector{Symbol}
+end
+function Base.iterate(lo::LoopOrders)
+    ShortVector(lo.syms), zeros(Int, length(lo.syms))# - 1)
+end
+
+function swap!(x, i, j)
+    xᵢ, xⱼ = x[i], x[j]
+    x[j], x[i] = xᵢ, xⱼ
+end
+function advance_state!(state)
+    N = length(state)
+    for n ∈ 1:N
+        sₙ = state[n]
+        if sₙ == N - n
+            if n == N
+                return false
+            else
+                state[n] = 0
+            end
+        else
+            state[n] = sₙ + 1
+            break
+        end
+    end
+    true
+end
+# I doubt this is the most efficient algorithm, but it's the simplest thing
+# that I could come up with.
+function Base.iterate(lo::LoopOrders, state)
+    advance_state!(state) || return nothing
+    # @show state
+    syms = copy(lo.syms)
+    for i ∈ eachindex(state)
+        sᵢ = state[i]
+        sᵢ == 0 || swap!(syms, i, i + sᵢ)
+    end
+    ShortVector(syms), state
+end
+
+function choose_order(ls::LoopSet)
+    is = copy(itersyms(ls))
+    best_cost = typemax(Int)
+    for lo ∈ LoopOrders(ls)
+        cost = evaluate_cost(ls, lo)
+
+    end
 end

 # Here, we have to figure out how to convert the loopset into a vectorized expression.
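For intuition, `advance_state!` increments a mixed-radix counter whose digit `n` ranges over `0:N-n`, and each state decodes into a permutation via swaps, so the iterator visits all `N!` loop orders. A hypothetical driver, assuming the definitions from this diff:

```julia
# Hypothetical: with three loop symbols the iterator yields all 3! = 6 orders.
for order ∈ LoopOrders([:i, :j, :k])
    println(order.data) # [:i, :j, :k], [:j, :i, :k], [:k, :j, :i], ...
end
```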
