Make a few cost updates

chriselrod · chriselrod · commit 8aec89391587 · 2021-03-09T22:41:12.000-05:00
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
@@ -32,7 +32,7 @@ using Base.Meta: isexpr
 using DocStringExtensions
 import LinearAlgebra # for check_args
 
-using Base.FastMath: add_fast, sub_fast, mul_fast, div_fast, inv_fast, abs2_fast, rem_fast, max_fast, min_fast
+using Base.FastMath: add_fast, sub_fast, mul_fast, div_fast, inv_fast, abs2_fast, rem_fast, max_fast, min_fast, log_fast, log2_fast, log10_fast
 
 
 using ArrayInterface
diff --git a/src/modeling/costs.jl b/src/modeling/costs.jl
@@ -83,9 +83,9 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
         extra_latency = sl - srt
         srt *= W
         sl = round(Int, srt + extra_latency)
-    else # we assume custom cost, and that latency == recip_throughput
-        scaling = ic.scaling
-        sl, srt = round(Int,scaling), scaling
+    # else # we assume custom cost, and that latency == recip_throughput
+    #     scaling = ic.scaling
+    #     sl, srt = round(Int,scaling), scaling
     end
     srt, sl, srp
 end
@@ -224,28 +224,32 @@ const COST = Dict{Symbol,InstructionCost}(
     # :vdivlog10add! =>InstructionCost(13,4.0,-2.0),
     :sqrt => InstructionCost(15,4.0,-2.0),
     :sqrt_fast => InstructionCost(15,4.0,-2.0),
-    :log => InstructionCost(20,20.0,20.0,20),
-    :log1p => InstructionCost(20,25.0,25.0,20), # FIXME
-    :exp => InstructionCost(20,20.0,20.0,18),
-    :expm1 => InstructionCost(20,25.0,25.0,18), # FIXME
-    :(^) => InstructionCost(40,40.0,40.0,26), # FIXME
-    :sin => InstructionCost(18,15.0,68.0,23),
-    :cos => InstructionCost(18,15.0,68.0,26),
-    :sincos => InstructionCost(25,22.0,70.0,26),
+    :log => InstructionCost(-3.0, 15, 30, 11),
+    :log2 => InstructionCost(-3.0, 15, 30, 11),
+    :log10 => InstructionCost(-3.0, 15, 30, 11),
+    :log1p => InstructionCost(-3.0, 15, 30, 11),
+    :exp => InstructionCost(-3.0,13.0,26.0,14),
+    :exp2 => InstructionCost(-3.0,10.0,40.0,14),
+    :exp10 => InstructionCost(-3.0,13.0,26.0,14),
+    :expm1 => InstructionCost(-3.0,30.0,60.0,19),
+    :(^) => InstructionCost(-3.0,200.0,400.0,26), # FIXME
+    :sin => InstructionCost(-3,30.0,60.0,23),
+    :cos => InstructionCost(-3,27.0,60.0,26),
+    :sincos => InstructionCost(-3,37.0,85.0,26),
     :sinpi => InstructionCost(18,15.0,68.0,23),
     :cospi => InstructionCost(18,15.0,68.0,26),
-    :sincospi => InstructionCost(25,22.0,70.0,26),
+    :sincospi => InstructionCost(25,37.0,70.0,26),
     :log_fast => InstructionCost(20,20.0,40.0,20),
     :exp_fast => InstructionCost(20,20.0,20.0,18),
     :sin_fast => InstructionCost(18,15.0,68.0,23),
     :cos_fast => InstructionCost(18,15.0,68.0,26),
-    :sincos_fast => InstructionCost(25,22.0,70.0,26),
+    :sincos_fast => InstructionCost(-3,37.0,85.0,26),
     :sinpi_fast => InstructionCost(18,15.0,68.0,23),
     :cospi_fast => InstructionCost(18,15.0,68.0,26),
     :sincospi_fast => InstructionCost(25,22.0,70.0,26),
-    :tanh => InstructionCost(40,40.0,40.0,26), # FIXME
-    :tanh_fast => InstructionCost(25,22.0,70.0,26), # FIXME
-    :sigmoid_fast => InstructionCost(25,22.0,70.0,26), # FIXME
+    :tanh => InstructionCost(-3.0,80.0,160.0,26), # FIXME
+    :tanh_fast => InstructionCost(-3.0,30.0,60.0,20), # FIXME
+    :sigmoid_fast => InstructionCost(-3.0,16.0,66.0,15), # FIXME
     :identity => InstructionCost(0,0.0,0.0,0),
     :adjoint => InstructionCost(0,0.0,0.0,0),
     :conj => InstructionCost(0,0.0,0.0,0),
@@ -548,11 +552,15 @@ const FUNCTIONSYMBOLS = IdDict{Type{<:Function},Instruction}(
     typeof(Base.FastMath.sqrt_fast) => :sqrt,
     # typeof(VectorizationBase.vsqrt) => :sqrt,
     typeof(log) => :log,
+    typeof(log2) => :log2,
+    typeof(log10) => :log10,
     typeof(Base.FastMath.log_fast) => :log,
     typeof(log1p) => :log1p,
     # typeof(VectorizationBase.vlog) => :log,
     typeof(SLEEFPirates.log) => :log,
     typeof(exp) => :exp,
+    typeof(exp2) => :exp2,
+    typeof(exp10) => :exp10,
     typeof(Base.FastMath.exp_fast) => :exp,
     typeof(expm1) => :expm1,
     # typeof(VectorizationBase.vexp) => :exp,
diff --git a/src/modeling/determinestrategy.jl b/src/modeling/determinestrategy.jl
@@ -254,28 +254,38 @@ function unroll_no_reductions(ls, order, vloopsym)
         unrolled = order[end-1]
     end
     # latency not a concern, because no depchains
+    compute_l = 0.0
+    # rp = 0
     for op ∈ operations(ls)
         dependson(op, unrolled) || continue
+        rt, sl, rpop = cost(ls, op, (unrolled,Symbol("")), vloopsym, Wshift, size_T)
+        # rp += rpop
         if iscompute(op)
-            compute_rt += first(cost(ls, op, (unrolled,Symbol("")), vloopsym, Wshift, size_T))
+            compute_rt += rt
+            compute_l += sl
         elseif isload(op)
-            load_rt += first(cost(ls, op, (unrolled,Symbol("")), vloopsym, Wshift, size_T))
+            load_rt += rt
         elseif isstore(op)
-            store_rt += first(cost(ls, op, (unrolled,Symbol("")), vloopsym, Wshift, size_T))
+            store_rt += rt
         end
     end 
     # heuristic guess
     # roundpow2(min(4, round(Int, (compute_rt + load_rt + 1) / compute_rt)))
     memory_rt = load_rt + store_rt
+    # When compute-bound, `u` is the ratio of summed `sl` to summed `rt`, clamped to 1:4.
+    
     u = if compute_rt > memory_rt
-        max(1, VectorizationBase.nextpow2( min( 4, round(Int, 8 / compute_rt) ) ))
+        clamp(round(Int, compute_l / compute_rt), 1, 4)
+        # max(1, VectorizationBase.nextpow2( min( 4, round(Int, 8 / compute_rt) ) ))
     elseif iszero(compute_rt)
         4
     elseif iszero(load_rt)
         iszero(store_rt) ? 4 : max(1, min(4, round(Int, 2compute_rt / store_rt)))
     else
         max(1, min(4, round(Int, 2compute_rt / load_rt)))
     end
+    # u = min(u, max(1, (reg_count(ls) ÷ max(1,round(Int,rp)))))
+    # @show u
     # commented out here is to decide to align loops
     # if memory_rt > compute_rt && isone(u) && (length(order) > 1) && (last(order) === vloopsym) && length(getloop(ls, last(order))) > 8W
     #     ls.align_loops[] = findfirst(operations(ls)) do op
diff --git a/utils/generate_costs.jl b/utils/generate_costs.jl
@@ -0,0 +1,103 @@
+using VectorizationBase, LoopVectorization
+using VectorizationBase: data
+
+# Call the `llvm.readcyclecounter` intrinsic (declared below as returning `i64`).
+# `@generated` to use VectorizationBase's API for supporting 1.5 and 1.6+.
+@generated function readcyclecounter()
+    body = "%res = call i64 @llvm.readcyclecounter()\nret i64 %res"
+    return VectorizationBase.llvmcall_expr("declare i64 @llvm.readcyclecounter()", body, :Int64, :(Tuple{}), "i64", String[], Symbol[])
+end
+
+# `volatile(x)` passes `x` through an empty `asm sideeffect` block and returns it.
+# NOTE(review): presumably an optimization barrier so benchmarked calls are neither
+# hoisted out of the timing loop nor eliminated — confirm against the generated code.
+@generated function volatile(x::Vec{W,T}) where {W,T}
+    vtyp = "<$W x $(VectorizationBase.LLVM_TYPES[T])>"  # LLVM vector type string
+    ir = "%res = call $vtyp asm sideeffect \"\", \"=v,v\"($vtyp %0)\nret $vtyp %res"
+    quote
+        $(Expr(:meta, :inline))
+        Vec(Base.llvmcall($ir, NTuple{$W,Core.VecElement{$T}}, Tuple{NTuple{$W,Core.VecElement{$T}}}, VectorizationBase.data(x)))
+    end
+end
+# Container methods apply `volatile` to each contained element.
+@inline volatile(x::VecUnroll) = VecUnroll(VectorizationBase.fmap(volatile, data(x)))
+@inline volatile(x::Tuple) = map(volatile, x)
+# @generated function volatile(x::Vec{W,T}, x::Vec{W,T}) where {W,T}
+#     typ = VectorizationBase.LLVM_TYPES[T]
+#     vtyp = "<$W x $typ>"
+
+#     suffix = T == Float32 ? "ps" : "pd"
+#     sideeffect_str = """%res = call <$W x $(typ)> asm sideeffect "", "=v,v"(<$W x $(typ)> %0, <$W x $(typ)> %1)
+#                                ret <$W x $(typ)> %res"""
+#     quote
+#         $(Expr(:meta, :inline))
+#         Vec(Base.llvmcall($sideeffect_str, NTuple{$W,Core.VecElement{$T}}, Tuple{NTuple{$W,Core.VecElement{$T}}}, VectorizationBase.data(x)))
+#     end
+# end
+
+num_vectors(::Vec) = 1  # a lone `Vec` counts as one vector
+num_vectors(::VecUnroll{N}) where {N} = N + 1  # a `VecUnroll{N}` counts as N+1 vectors
+# Time `f` on `vs`; returns cycle-counter ticks per call, per vector of the first argument.
+function unrolltest(f::F, vs::Vararg{Any,K}) where {F,K}
+    num_iter = 4_194_304
+    start = readcyclecounter()
+    for _ ∈ 1:num_iter
+        # inputs and output both go through `volatile` on every iteration
+        volatile(f(map(volatile, vs)...))
+    end
+    return (readcyclecounter() - start) / (num_vectors(first(vs)) * num_iter)
+end
+
+# @generated function vapply!(f::F, y, x, ::Val{U}) where {F,U}
+#     quote
+#         @avx unroll=$U for j ∈ 1:1024
+#             y[j] = f(x[j])
+#         end
+#     end
+# end
+
+# vector_init(::Val{N}, ::Type{T}) where {N,T} = VectorizationBase.zero_vecunroll(StaticInt(N), pick_vector_width(T), T, VectorizationBase.register_size())
+# vector_init(::Val{1}, ::Type{T}) where {T} = VectorizationBase.vzero(pick_vector_width(T), T)
+
+# @generated function unrolltest(f::F, x::AbstractVector{T}, ::Val{U}) where {F,U,T}
+#     quote
+#         cc = readcyclecounter()
+#         for i ∈ 1:8192
+#             s = vector_init(Val{$U}(), $T)
+#             @avx unroll=$U for j ∈ 1:512
+#                 s += f(x[j])
+#             end
+#             volatile(s)
+#         end
+#         cycles = readcyclecounter() - cc
+#         pick_vector_width(T) * cycles / (512 * 8192)
+#     end
+# end
+
+
+# @generated function unrolltest!(f::F, y::AbstractVector{T}, x::AbstractVector{T}, ::Val{U}) where {F,U,T}
+#     quote
+#         cc = readcyclecounter()
+#         for i ∈ 1:8192
+#             @avx unroll=$U for j ∈ 1:512
+#                 y[j] = f(x[j])
+#             end
+#         end
+#         cycles = readcyclecounter() - cc
+#         pick_vector_width(T) * cycles / (512 * 8192)
+#     end
+# end
+
+let
+    # Random Float64 inputs: one vector, then unrolled groups of 2, 4, and 8 vectors.
+    vx = Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...);
+    vu2 = VectorizationBase.VecUnroll(ntuple(_ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...), Val(2)));
+    vu4 = VectorizationBase.VecUnroll(ntuple(_ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...), Val(4)));
+    vu8 = VectorizationBase.VecUnroll(ntuple(_ -> Vec(ntuple(_ -> 10randn(), pick_vector_width(Float64))...), Val(8)));
+    # `f` is each unary function to time; results are printed so the run is usable.
+    for f ∈ [log, log2, log10, log1p, exp, exp2, exp10, expm1, sin, cos]
+        rts = map(v -> unrolltest(f, v), (vx, vu2, vu4, vu8))
+        println(f, ": ", rts)  # `unrolltest` results for vx, vu2, vu4, vu8 in order
+    end
+end
+