Fix lowering when an outer reduction isn't unrolled, and change the heuristics in determinestrategy.jl in ways that should (hopefully) improve performance more often than not.

chriselrod · chriselrod · commit 272c856f7b65 · 2020-06-21T06:01:52.000-04:00
diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl
@@ -157,17 +157,23 @@ end
 function loopset_return_value(ls::LoopSet, ::Val{extract}) where {extract}
     @assert !iszero(length(ls.outer_reductions))
     if isone(length(ls.outer_reductions))
+        op = getop(ls, ls.outer_reductions[1])
         if extract
-            Expr(:call, :extract_data, Symbol(mangledvar(getop(ls, ls.outer_reductions[1])), 0))
+            if (isu₁unrolled(op) | isu₂unrolled(op))
+                Expr(:call, :extract_data, Symbol(mangledvar(op), 0))
+            else
+                Expr(:call, :extract_data, mangledvar(op))
+            end
         else
-            Symbol(mangledvar(getop(ls, ls.outer_reductions[1])), 0)
+            Symbol(mangledvar(op), 0)
         end
     else#if length(ls.outer_reductions) > 1
         ret = Expr(:tuple)
         ops = operations(ls)
         for or ∈ ls.outer_reductions
+            op = ops[or]
             if extract
-                push!(ret.args, Expr(:call, :extract_data, Symbol(mangledvar(ops[or]), 0)))
+                push!(ret.args, Expr(:call, :extract_data, Symbol(mangledvar(op), 0)))
             else
                 push!(ret.args, Symbol(mangledvar(ops[or]), 0))
             end
diff --git a/src/determinestrategy.jl b/src/determinestrategy.jl
@@ -67,7 +67,6 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
     srt, sl, srp = opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr)
     if accesses_memory(op)
         # either vbroadcast/reductionstore, vmov(a/u)pd, or gather/scatter
-        # @show instr, vectorized, loopdependencies(op), unitstride(op, vectorized)
         if opisvectorized
             if !unitstride(ls, op, vectorized)# || !isdense(op) # need gather/scatter
                 r = (1 << Wshift)
@@ -131,12 +130,11 @@ function evaluate_cost_unroll(
             rd = reduceddependencies(op)
             hasintersection(rd, @view(nested_loop_syms[1:end-length(rd)])) && return Inf
             included_vars[id] = true
-            # @show op first(cost(op, vectorized, Wshift, size_T)), iter
             total_cost += iter * first(cost(ls, op, vectorized, Wshift, size_T))
             total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
         end
     end
-    total_cost + stride_penalty(ls, order)
+    total_cost + stride_penalty(ls, order) - 1.0 # -1.0 to place finger on scale in its favor
 end
 
 # only covers vectorized ops; everything else considered lifted?
@@ -163,13 +161,16 @@ function parentsnotreduction(op::Operation)
     end
     return true
 end
-function roundpow2(i::Integer)
-    u = VectorizationBase.nextpow2(i)
-    l = u >>> 1
-    ud = u - i
-    ld = i - l
-    ud > ld ? l : u
-end
+# function roundpow2(i::Integer)
+#     u = VectorizationBase.nextpow2(i)
+#     l = u >>> 1
+#     ud = u - i
+#     ld = i - l
+#     ud > ld ? l : u
+# end
+# function roundpow2(x::Float64)
+    # 1 << round(Int, log2(x))
+# end
 function unroll_no_reductions(ls, order, vectorized)
     size_T = biggest_type_size(ls)
     W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vectorized), size_T)::Tuple{Int,Int}
@@ -190,10 +191,10 @@ function unroll_no_reductions(ls, order, vectorized)
         end
     end
     # heuristic guess
-    # @show compute_rt, load_rt
     # roundpow2(min(4, round(Int, (compute_rt + load_rt + 1) / compute_rt)))
     rt = max(compute_rt, load_rt)
-    (iszero(rt) ? 4 : max(1, roundpow2( min( 4, round(Int, 16 / rt) ) ))), unrolled
+    # (iszero(rt) ? 4 : max(1, roundpow2( min( 4, round(Int, 16 / rt) ) ))), unrolled
+    (iszero(rt) ? 4 : max(1, VectorizationBase.nextpow2( min( 4, round(Int, 16 / rt) ) ))), unrolled
 end
 function determine_unroll_factor(
     ls::LoopSet, order::Vector{Symbol}, unrolled::Symbol, vectorized::Symbol
@@ -204,17 +205,24 @@ function determine_unroll_factor(
     # So if num_reductions > 0, we set the unroll factor to be high enough so that the CPU can be kept busy
     # if there are, U = max(1, round(Int, max(latency) * throughput / num_reductions)) = max(1, round(Int, latency / (recip_throughput * num_reductions)))
     # We also make sure register pressure is not too high.
-    latency = 0
+    latency = 1
+    # compute_recip_throughput_u = 0.0
     compute_recip_throughput = 0.0
     visited_nodes = fill(false, length(operations(ls)))
     load_recip_throughput = 0.0
     store_recip_throughput = 0.0
     for op ∈ operations(ls)
-        dependson(op, unrolled) || continue
+        # dependson(op, unrolled) || continue
         if isreduction(op)
             rt, sl = depchain_cost!(ls, visited_nodes, op, vectorized, Wshift, size_T)
-            latency = max(sl, latency)
+            if isouterreduction(op) != -1 || unrolled ∉ reduceddependencies(op)
+                latency = max(sl, latency)
+            end
+            # if unrolled ∈ loopdependencies(op)
+            #     compute_recip_throughput_u += rt
+            # else
             compute_recip_throughput += rt
+            # end
         elseif isload(op)
             load_recip_throughput += first(cost(ls, op, vectorized, Wshift, size_T))
         elseif isstore(op)
@@ -247,19 +255,20 @@ function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vectorized:
         # if more than 1 loop, there is some cost. Picking 2 here as a heuristic.
         return unroll_no_reductions(ls, order, vectorized)
     end
-
+    innermost_loop = last(order)
     rt = Inf; rtcomp = Inf; latency = Inf; best_unrolled = Symbol("")
     for unrolled ∈ order
         rttemp, ltemp = determine_unroll_factor(ls, order, unrolled, vectorized)
-        rtcomptemp = rttemp + (0.01 * (vectorized === unrolled))
+        rtcomptemp = rttemp + (0.01 * ((vectorized === unrolled) + (unrolled === innermost_loop) - latency))
         if rtcomptemp < rtcomp
             rt = rttemp
             rtcomp = rtcomptemp
             latency = ltemp
             best_unrolled = unrolled
         end
     end
-    min(8, roundpow2(max(1, round(Int, latency / (rt * num_reductions) ) ))), best_unrolled
+    # min(8, roundpow2(max(1, round(Int, latency / (rt * num_reductions) ) ))), best_unrolled
+    min(8, VectorizationBase.nextpow2(max(1, round(Int, latency / (rt * num_reductions) ) ))), best_unrolled
 end
 
 function unroll_cost(X, u₁, u₂, u₁L, u₂L)
@@ -273,7 +282,6 @@ end
 #     u₁b = 1; u₂b = 1
 #     for u₁ ∈ 1:4, u₂ ∈ 1:4
 #         c = unroll_cost(X, u₁, u₂, u₁L, u₂L)
-#         @show u₁, u₂, c
 #         if cb > c
 #             cb = c
 #             u₁b = u₁; u₂b = u₂
@@ -679,7 +687,6 @@ function evaluate_cost_tile(
     # cost_mat[2] / ( u₂loopsym)
     # cost_mat[3] / ( unrolled)
     # cost_mat[4]
-    # @show order
     cost_vec = cost_vec_buf(ls)
     reg_pressure = reg_pres_buf(ls)
     # @inbounds reg_pressure[2] = 1
@@ -708,8 +715,6 @@ function evaluate_cost_tile(
             included_vars[id] && continue
             # it must also be a subset of defined symbols
             all(ld -> ld ∈ nested_loop_syms, loopdependencies(op)) || continue
-            # # @show nested_loop_syms
-            # # @show reduceddependencies(op)
             rd = reduceddependencies(op)
             hasintersection(rd, @view(nested_loop_syms[1:end-length(rd)])) && return 0,0,Inf,false
             included_vars[id] = true
@@ -720,7 +725,6 @@ function evaluate_cost_tile(
             # reduced_by_unrolling[2,id] = (u₂reached | depends_on_u₁) & !depends_on_u₂
             reduced_by_unrolling[1,id] = (u₁reached) & !depends_on_u₁
             reduced_by_unrolling[2,id] = (u₂reached) & !depends_on_u₂
-            # @show op iter, unrolledu₂loopsym[:,id]
             iters[id] = iter
             innerloop ∈ loopdependencies(op) && set_upstream_family!(descendentsininnerloop, op, true)
         end
@@ -730,7 +734,6 @@ function evaluate_cost_tile(
         opisininnerloop = descendentsininnerloop[id]
         
         u₁reduces, u₂reduces = reduced_by_unrolling[1,id], reduced_by_unrolling[2,id]
-        # @show op, u₁reduces, u₂reduces
         if isload(op)
             if add_constant_offset_load_elmination_cost!(cost_vec, reg_pressure, choose_to_inline, ls, op, iters[id], unrollsyms, u₁reduces, u₂reduces, Wshift, size_T, opisininnerloop)
                 continue
@@ -743,34 +746,26 @@ function evaluate_cost_tile(
             rt += 0.5VectorizationBase.REGISTER_SIZE / VectorizationBase.CACHELINE_SIZE
             prefetch_good_idea = true
         end
-        # @show isunrolled₁, isunrolled₂, op rt, lat, rp
         rp = (opisininnerloop && !(loadintostore(ls, op))) ? rp : zero(rp) # we only care about register pressure within the inner most loop
         # rp = opisininnerloop ? rp : zero(rp) # we only care about register pressure within the inner most loop
         rto = rt
         rt *= iters[id]
         if u₁reduces & u₂reduces
-            # @show op 4, rto, iters[id], lat, rp
             cost_vec[4] += rt
             reg_pressure[4] += rp
         elseif u₂reduces # cost decreased by unrolling u₂loop
-            # @show op 2, rto, iters[id], lat, rp
             cost_vec[2] += rt
             reg_pressure[2] += rp
         elseif u₁reduces # cost decreased by unrolling u₁loop
-            # @show op 3, rto, iters[id], lat, rp
             cost_vec[3] += rt
             reg_pressure[3] += rp
         else # no cost decrease; cost must be repeated
-            # @show op 1, rto, iters[id], lat, rp
             cost_vec[1] += rt
             reg_pressure[1] += rp
         end
     end
     # @inbounds ((cost_vec[4] > 0) || ((cost_vec[2] > 0) & (cost_vec[3] > 0))) || return 0,0,Inf,false
-    # @show cost_vec reg_pressure
     costpenalty = (sum(reg_pressure) > REGISTER_COUNT) ? 2 : 1
-    # @show order, vectorized cost_vec reg_pressure
-    # @show solve_unroll(ls, u₁loopsym, u₂loopsym, cost_vec, reg_pressure)
     u₁v = vectorized === u₁loopsym; u₂v = vectorized === u₂loopsym
     round_uᵢ = prefetch_good_idea ? (u₁v ? 1 : (u₂v ? 2 : 0)) : 0
     u₁, u₂, ucost = solve_unroll(ls, u₁loopsym, u₂loopsym, cost_vec, reg_pressure, W, vectorized, round_uᵢ)
@@ -820,7 +815,6 @@ end
 # that I could come up with.
 function Base.iterate(lo::LoopOrders, state)
     advance_state!(state) || return nothing
-    # # @show state
     syms = copyto!(lo.buff, lo.syms)
     for i ∈ eachindex(state)
         sᵢ = state[i]
diff --git a/src/graphs.jl b/src/graphs.jl
@@ -703,3 +703,11 @@ function UnrollSpecification(ls::LoopSet, u₁loop::Symbol, u₂loop::Symbol, ve
     nv = findfirst(isequal(vectorized), order)::Int
     UnrollSpecification(nu₁, nu₂, nv, u₁, u₂)
 end
+
+# function getunrolled(ls::LoopSet)
+#     order = names(ls)
+#     us = ls.unrollspecification[]
+#     @unpack u₁loopnum, u₂loopnum = us
+#     order[u₁loopnum], order[u₂loopnum]
+# end
+
diff --git a/src/lowering.jl b/src/lowering.jl
@@ -440,13 +440,21 @@ function add_upper_outer_reductions(ls::LoopSet, loopq::Expr, Ulow::Int, Uhigh::
     Expr(:if, ncomparison, ifq)
 end
 function reduce_expr!(q::Expr, ls::LoopSet, U::Int)
+    us = ls.unrollspecification[]
+    # u₁loop, u₂loop = getunrolled(ls)
     for or ∈ ls.outer_reductions
         op = ls.operations[or]
         var = name(op)
         mvar = mangledvar(op)
         instr = instruction(op)
         reduce_expr!(q, mvar, instr, U)
-        length(ls.opdict) == 0 || push!(q.args, Expr(:(=), var, Expr(:call, lv(reduction_scalar_combine(instr)), var, Symbol(mvar, 0))))
+        if !iszero(length(ls.opdict))
+            if (isu₁unrolled(op) | isu₂unrolled(op))
+                push!(q.args, Expr(:(=), var, Expr(:call, lv(reduction_scalar_combine(instr)), var, Symbol(mvar, 0))))
+            else
+                push!(q.args, Expr(:(=), var, mvar))
+            end
+        end
     end
 end
 function gc_preserve(ls::LoopSet, q::Expr)
diff --git a/test/gemv.jl b/test/gemv.jl
@@ -3,7 +3,7 @@ using Test
 # T = Float32
 @testset "GEMV" begin
     # Unum, Tnum = LoopVectorization.VectorizationBase.REGISTER_COUNT == 16 ? (3, 4) : (4, 6)
-    Unum, Tnum = LoopVectorization.VectorizationBase.REGISTER_COUNT == 16 ? (2, 6) : (2, 10)
+    Unum, Tnum = LoopVectorization.VectorizationBase.REGISTER_COUNT == 16 ? (1, 6) : (1, 10)
     gemvq = :(for i ∈ eachindex(y)
               yᵢ = 0.0
               for j ∈ eachindex(x)
diff --git a/test/miscellaneous.jl b/test/miscellaneous.jl
@@ -5,7 +5,7 @@ using Test
 @testset "Miscellaneous" begin
 
     # Unum, Tnum = LoopVectorization.VectorizationBase.REGISTER_COUNT == 16 ? (3, 4) : (4, 4)
-    Unum, Tnum = LoopVectorization.REGISTER_COUNT == 16 ? (2, 6) : (2, 10)
+    Unum, Tnum = LoopVectorization.REGISTER_COUNT == 16 ? (1, 6) : (1, 10)
     dot3q = :(for m ∈ 1:M, n ∈ 1:N
               s += x[m] * A[m,n] * y[n]
               end);
@@ -69,9 +69,12 @@ using Test
                 B[j,i] = A[j,i] - x[j]
                 end)
     lssubcol = LoopVectorization.LoopSet(subcolq);
-    if LoopVectorization.REGISTER_COUNT != 8
-        @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, :i, :j, Unum, Tnum)
-    end
+    # if LoopVectorization.REGISTER_COUNT != 8
+    #     # @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, :i, :j, Unum, Tnum)
+    #     @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, :i, :j, 1, 1)
+    # end
+    @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :i, Symbol("##undefined##"), :j, 4, -1)
+    # @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 4, -1)
     ## @avx is SLOWER!!!!
     ## need to fix!
     function mysubcol!(B, A, x)
@@ -96,9 +99,11 @@ using Test
                 x[j] += A[j,i] - 0.25
                 end)
     lscolsum = LoopVectorization.LoopSet(colsumq);
-    if LoopVectorization.REGISTER_COUNT != 8
-        @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, Unum, Tnum)
-    end
+    # if LoopVectorization.REGISTER_COUNT != 8
+    #     # @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, Unum, Tnum)
+    #     @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, :i, :j, 1, 1)
+    # end
+    @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 8, -1)
     # my colsum is wrong (by 0.25), but slightly more interesting
     function mycolsum!(x, A)
         @. x = 0
@@ -133,11 +138,13 @@ using Test
              end)
     lsvar = LoopVectorization.LoopSet(varq);
     # LoopVectorization.choose_order(lsvar)
-    if LoopVectorization.REGISTER_COUNT == 32
-        @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, :i, :j, 2, 10)
-    elseif LoopVectorization.REGISTER_COUNT == 16
-        @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, :i, :j, 2, 6)
-    end
+    # @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, :i, :j, Unum, Tnum)
+    @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 8, -1)
+    # if LoopVectorization.REGISTER_COUNT == 32
+    #     @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, :i, :j, 2, 10)
+    # elseif LoopVectorization.REGISTER_COUNT == 16
+    #     @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, :i, :j, 2, 6)
+    # end
     
     function myvar!(s², A, x̄)
         @. s² = 0
@@ -686,8 +693,8 @@ using Test
         basis = rand(r, (dim, nbasis));
         coeffs = rand(T, nbasis);
         P = rand(T, dim, maxdeg+1);
-        mvp(P, basis, coeffs)
-        mvpavx(P, basis, coeffs)
+        # mvp(P, basis, coeffs)
+        # mvpavx(P, basis, coeffs)
         mvpv = mvp(P, basis, coeffs)
         @test mvpv ≈ mvpavx(P, basis, coeffs)
         if VERSION > v"1.1"