Commit 4cf6786

Working on computing costs of different loop orderings and unrolling strategies.
1 parent 6bf259c commit 4cf6786

3 files changed: +183 −2 lines

src/costs.jl

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# @enum CostScaling begin
#     NoCost
#     Linear
#     Unique
# end

struct InstructionCost
    scalar_latency::Int
    scalar_reciprocal_throughput::Float64
    scaling::Float64 # sentinel values: -2 == no scaling; -1 == linear scaling; > 0 => value is both the latency and the reciprocal throughput
end
# Sentinel constants assumed from the convention above; the commented-out CostScaling enum is not yet in use.
const NoCost = -2.0
const Linear = -1.0
InstructionCost(sl, srt) = InstructionCost(sl, srt, NoCost)

function scalar_cost(instruction::InstructionCost)#, ::Type{T} = Float64) where {T}
    instruction.scalar_latency, instruction.scalar_reciprocal_throughput
end
function vector_cost(instruction::InstructionCost, Wshift, ::Type{T} = Float64) where {T}
    sl, srt = scalar_cost(instruction)
    scaling = instruction.scaling
    if scaling == NoCost || Wshift == 0
        return sl, srt
    elseif scaling == Linear
        srt *= 1 << (Wshift + VectorizationBase.intlog2(sizeof(T)) - 4)
        if (sizeof(T) << Wshift) == VectorizationBase.REGISTER_SIZE # these instructions experience double latency with zmm registers
            sl += sl
        end
    end
    sl, srt
end
function cost(instruction::InstructionCost, Wshift, ::Type{T}) where {T}
    Wshift == 0 ? scalar_cost(instruction) : vector_cost(instruction, Wshift, T)
end

const COST = Dict{Symbol,InstructionCost}(
    :getindex => InstructionCost(3, 0.5),
    :setindex! => InstructionCost(3, 1.0), # but not part of a dependency chain, so not really twice as expensive?
    :+ => InstructionCost(4, 0.5),
    :- => InstructionCost(4, 0.5),
    :* => InstructionCost(4, 0.5),
    :/ => InstructionCost(13, 4.0),
    :muladd => InstructionCost(4, 0.5) # + and * will fuse into this, so much of the time they're not twice as expensive
    # Work-in-progress placeholders, omitted until a matching constructor exists:
    # :sqrt => InstructionCost(),
    # :log => InstructionCost(,,52.5),
    # :exp => InstructionCost(,,30.0),
    # :sin => InstructionCost(),
    # :cos => InstructionCost(),
    # :sincos => InstructionCost()
)

# const SIMDPIRATES_COST = Dict{Symbol,InstructionCost}()
# const SLEEFPIRATES_COST = Dict{Symbol,InstructionCost}()

# const MODULE_LOOKUP = Dict{Symbol,Dict{Symbol,InstructionCost}}(
#     :Base => BASE_COST,
#     :SIMDPirates => SIMDPIRATES_COST,
#     :SLEEFPirates => SLEEFPIRATES_COST
# )
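For a sense of how the table above is meant to be consumed, a minimal usage sketch — not part of the commit — assuming the file has been included and an AVX-512 target, so Wshift == 3 (8 Float64 lanes per 64-byte register):

# Illustrative only: query the cost table and scale for a vector width.
addcost = COST[:+]              # latency 4, reciprocal throughput 0.5, NoCost scaling
cost(addcost, 0, Float64)       # scalar path: (4, 0.5)
cost(addcost, 3, Float64)       # vector path; scaling == NoCost, so still (4, 0.5)
cost(COST[:/], 3, Float64)      # (13, 4.0) — also unscaled, since :/ uses the NoCost default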

src/graphs.jl

Lines changed: 74 additions & 2 deletions
@@ -1,14 +1,86 @@
# using LightGraphs # the previous `using LightGraphs` import was commented out in this commit

struct ShortIntVector
    data::Vector{Int}
end
function Base.hash(x::ShortIntVector, h::UInt)
    d = x.data
    @inbounds for n ∈ eachindex(d)
        h = hash(d[n], h)
    end
    h
end

@enum NodeType begin
    input
    store
    reduction
end

struct Node
    type::DataType
end

# Must make it easy to iterate
struct LoopSet

end

# Evaluates the cost of evaluating the loop nest in a given order.
function evaluate_cost(
    ls::LoopSet, order::ShortIntVector
)

end

# Here, we have to figure out how to convert the loop set into a vectorized expression.
# This must traverse in a parent -> child pattern,
# but the order also depends on which loop indices each node depends on.
# Requires sorting.
function lower(ls::LoopSet)

end

function Base.convert(::Type{Expr}, ls::LoopSet)
    lower(ls)
end
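The custom `Base.hash` suggests `ShortIntVector` is meant to key a `Dict`, e.g. to memoize `evaluate_cost` per loop ordering while searching over permutations. A sketch of that idea follows; the `cheapest_order` helper is hypothetical (not in the commit), it assumes `evaluate_cost` eventually returns a `Float64`, and a matching `Base.:(==)` method would also be needed for equal orderings to hash to the same entry:

# Illustrative sketch only: cache cost evaluations by loop order.
function cheapest_order(ls::LoopSet, orders::Vector{Vector{Int}})
    cache = Dict{ShortIntVector,Float64}()
    best_order, best_cost = first(orders), Inf
    for order ∈ orders
        key = ShortIntVector(order)
        c = get!(() -> evaluate_cost(ls, key), cache, key)
        if c < best_cost
            best_order, best_cost = order, c
        end
    end
    best_order, best_cost
end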

using BenchmarkTools, LoopVectorization, SLEEF
θ = randn(1000); c = randn(1000);
function sumsc_vectorized(θ::AbstractArray{Float64}, coef::AbstractArray{Float64})
    s, c = 0.0, 0.0
    @vvectorize for i ∈ eachindex(θ, coef)
        sinθᵢ, cosθᵢ = sincos(θ[i])
        s += coef[i] * sinθᵢ
        c += coef[i] * cosθᵢ
    end
    s, c
end
function sumsc_serial(θ::AbstractArray{Float64}, coef::AbstractArray{Float64})
    s, c = 0.0, 0.0
    @inbounds for i ∈ eachindex(θ, coef)
        sinθᵢ, cosθᵢ = sincos(θ[i])
        s += coef[i] * sinθᵢ
        c += coef[i] * cosθᵢ
    end
    s, c
end
function sumsc_sleef(θ::AbstractArray{Float64}, coef::AbstractArray{Float64})
    s, c = 0.0, 0.0
    @inbounds @simd for i ∈ eachindex(θ, coef)
        sinθᵢ, cosθᵢ = SLEEF.sincos_fast(θ[i])
        s += coef[i] * sinθᵢ
        c += coef[i] * cosθᵢ
    end
    s, c
end

@btime sumsc_serial($θ, $c)
@btime sumsc_sleef($θ, $c)
@btime sumsc_vectorized($θ, $c)

test/runtests.jl

Lines changed: 46 additions & 0 deletions
@@ -1,6 +1,52 @@
using LoopVectorization
using Test

using CpuId, VectorizationBase, SIMDPirates, SLEEFPirates
@generated function estimate_cost(f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)) where {F,T,U}
    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
    quote
        Base.Cartesian.@nexprs $U u -> s_u = vbroadcast(Vec{$W,$T}, zero(T))
        # s = vbroadcast(V, zero(T))
        x = rand(T, N << $Wshift)
        ptrx = pointer(x)
        ts_start, id_start = cpucycle_id()
        for k ∈ 1:K
            _ptrx = ptrx
            for n ∈ 1:N>>$(VectorizationBase.intlog2(U))
                Base.Cartesian.@nexprs $U u -> begin
                    v_u = vload(Vec{$W,$T}, _ptrx)
                    s_u = vadd(s_u, f(v_u))
                    _ptrx += VectorizationBase.REGISTER_SIZE
                end
                # v = vload(V, _ptrx)
                # s = vadd(s, f(v))
                # _ptrx += VectorizationBase.REGISTER_SIZE
            end
        end
        ts_end, id_end = cpucycle_id()
        @assert id_start == id_end
        Base.Cartesian.@nexprs $(U-1) u -> s_1 = vadd(s_1, s_{u+1})
        (ts_end - ts_start) / (N*K), vsum(s_1)
    end
end
estimate_cost(SLEEFPirates.exp, 512, 1_000, Float64, Val(4)) # 28 cycles

estimate_cost(SLEEFPirates.log, 512, 1_000, Float64, Val(1)) # 51 cycles
estimate_cost(SLEEFPirates.log, 512, 1_000, Float64, Val(2)) # 51 cycles
estimate_cost(SLEEFPirates.log, 512, 1_000, Float64, Val(4)) # 51 cycles
estimate_cost(SIMDPirates.vsqrt, 512, 1_000, Float64, Val(1)) # 23 cycles
estimate_cost(SIMDPirates.vsqrt, 512, 1_000, Float64, Val(2)) # 23 cycles
estimate_cost(SIMDPirates.vsqrt, 512, 1_000, Float64, Val(4)) # 23 cycles
estimate_cost(SIMDPirates.vinv, 512, 1_000, Float64, Val(1)) # 23 cycles
estimate_cost(SIMDPirates.vinv, 512, 1_000, Float64, Val(2)) # 23 cycles
estimate_cost(SIMDPirates.vinv, 512, 1_000, Float64, Val(4)) # 23 cycles

const cz = ntuple(Val(4)) do i Core.VecElement(randn()) end
# @code_native debuginfo=:none
estimate_cost(x -> SIMDPirates.vmul(x, cz), 1 << 9, 10^3, Float64, Val(1)) # 4.5 cycles
estimate_cost(x -> SIMDPirates.vmul(x, cz), 1 << 9, 10^3, Float64, Val(2)) # 2 cycles
estimate_cost(x -> SIMDPirates.vmul(x, cz), 1 << 9, 10^3, Float64, Val(4)) # 1 cycle
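The vmul timings illustrate the latency versus reciprocal-throughput distinction that src/costs.jl encodes: with a single accumulator (Val(1)) each iteration's vadd must wait on the previous result, so the ~4.5 cycles per vector is roughly the accumulation latency plus overhead, while four independent accumulators (Val(4)) overlap the chains and approach the throughput limit of ~1 cycle per vector. A rough sanity check, assuming the latency-4 / reciprocal-throughput-0.5 figures used for :* in the cost table (illustrative arithmetic, not part of the test suite):

# With U independent accumulators, cycles per vector ≈ max(latency / U, reciprocal throughput),
# ignoring the vload and loop overhead.
latency, rthroughput = 4, 0.5
predict(U) = max(latency / U, rthroughput)
predict(1), predict(2), predict(4)   # (4.0, 2.0, 1.0) vs. measured ≈ (4.5, 2.0, 1.0)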
@testset "LoopVectorization.jl" begin
    # Write your own tests here.
end
