Fix zero-cost modeling of add/sub on constants, which resulted in bad loop splitting.

chriselrod · chriselrod · commit a232255f9302 · 2021-05-01T21:17:06.000-04:00
diff --git a/src/codegen/loopstartstopmanager.jl b/src/codegen/loopstartstopmanager.jl
@@ -85,7 +85,6 @@ function indices_calculated_by_pointer_offsets(ls::LoopSet, ar::ArrayReferenceMe
     gespinds = Expr(:tuple)
     out = Vector{Bool}(undef, length(indices))
     li = ar.loopedindex
-    # @show ls.vector_width
     for i ∈ eachindex(li)
         ii = i + offset
         ind = indices[ii]
@@ -249,7 +248,6 @@ function cse_constant_offsets!(
   ls::LoopSet, allarrayrefs::Vector{ArrayReferenceMeta}, allarrayrefsind::Int, name_to_array_map::Vector{Vector{Int}}, arrayref_to_name_op_collection::Vector{Vector{Tuple{Int,Int,Int}}}
 )
   ar = allarrayrefs[allarrayrefsind]
-  # @show ar
   # vptrar = vptr(ar)
   arrayref_to_name_op = arrayref_to_name_op_collection[allarrayrefsind]
   array_refs_with_same_name = name_to_array_map[first(first(arrayref_to_name_op))]
@@ -542,7 +540,6 @@ function use_loop_induct_var!(
   offsetprecalc_descript = Expr(:tuple)
   use_offsetprecalc = false
   vptrar = vptr(ar)
-  # @show ar
   Wisz = false#ls.vector_width == 0
   for (i,isli) ∈ enumerate(li)
     ii = i + offset
@@ -605,7 +602,6 @@ function add_loop_start_stop_manager!(ls::LoopSet)
       use_livs[i] = use_loop_induct_var!(ls, q, arrayrefs[i], arrayrefs, i, includeinlet[i])
       #name_to_array_map[first(first(unique_to_name_and_op_map[i]))], unique_to_name_and_op_map)
     end
-    # @show use_livs,
     # loops, sorted from outer-most to inner-most
     looporder = reversenames(ls)
     # For each loop, we need to choose an induction variable
@@ -633,7 +629,6 @@ function add_loop_start_stop_manager!(ls::LoopSet)
         terminators[nloops+1-i] = if (loopsym ∈ loopinductvars) || (any(r -> any(isequal(-i), r), use_livs)) || iszero(length(loopstartᵢ))
             0
         else
-            # @show i, loopsym loopdependencies.(operations(ls)) operations(ls)
             # @assert !iszero(length(loopstartᵢ))
             last(ric[argmin(first.(ric))]) # index corresponds to array ref's position in loopstart
         end
@@ -703,9 +698,7 @@ function pointermax_index(ls::LoopSet, ar::ArrayReferenceMeta, n::Int, sub::Int,
     loopsym = names(ls)[n]
     index = Expr(:tuple);
     ind = 0
-    # @show ar loopsym names(ls) n
     for (j,i) ∈ enumerate(getindicesonly(ar))
-        # @show j,i
         if i === loopsym
             ind = j
             if iszero(sub)
@@ -773,7 +766,6 @@ function append_pointer_maxes!(
             push!(loopstart.args, Expr(:(=), maxsym(vptr_ar, sub), pointermax(ls, ar, n, sub, isvectorized, stopindicator, incr)))
         end
     else
-        # @show n, getloop(ls, n) ar
         index, ind = pointermax_index(ls, ar, n, submax, isvectorized, stopindicator, incr)
         pointercompbase = maxsym(vptr_ar, submax)
         push!(loopstart.args, Expr(:(=), pointercompbase, Expr(:call, lv(:gesp), vptr_ar, index)))
@@ -839,7 +831,6 @@ function startloop(ls::LoopSet, us::UnrollSpecification, n::Int, submax = maxunr
       push!(loopstart.args, startloop(getloop(ls, loopsym), loopsym))
     else
         isvectorized = n == vloopnum
-        # @show ptrdefs
         append_pointer_maxes!(loopstart, ls, ptrdefs[termind], n, submax, isvectorized)
     end
     loopstart
@@ -891,7 +882,6 @@ function terminatecondition(ls::LoopSet, us::UnrollSpecification, n::Int, inclma
 
     termar = lssm.incrementedptrs[n][termind]
     ptr = vptr(termar)
-    # @show UF, isvectorized(us, n)
     if inclmask && isvectorized(us, n)
         Expr(:call, :<, ptr, maxsym(ptr, 0))
     else
diff --git a/src/codegen/lower_constant.jl b/src/codegen/lower_constant.jl
@@ -92,7 +92,6 @@ function getparentsreductzero(ls::LoopSet, op::Operation)::Float64
             return reduction_instruction_class(instruction(opp))
         end
     end
-    @show identifier(op)
     throw("Reduct zero not found for operation $(name(op)).")
 end
 vecbasefunc(f) = Expr(:(.), Expr(:(.), :LoopVectorization, QuoteNode(:VectorizationBase)), QuoteNode(f))
diff --git a/src/codegen/split_loops.jl b/src/codegen/split_loops.jl
@@ -102,7 +102,7 @@ function lower_and_split_loops(ls::LoopSet, inline::Int)
         ls_2 = split_loopset(ls, remaining_ops)
         order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 = choose_order_cost(ls_2)
         # U_1 = T_1 = U_2 = T_2 = 2
-        # @show cost_1 + cost_2 ≤ cost_fused, cost_1, cost_2, cost_fused
+        #@show cost_1 + cost_2 ≤ cost_fused, cost_1, cost_2, cost_fused
         if cost_1 + cost_2 ≤ cost_fused
             ls_2_lowered = if length(remaining_ops) > 1
                 inline = iszero(inline) ? (shouldinline_1 % Int) : inline
diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl
@@ -105,7 +105,6 @@ function shifted_loopset(ls::LoopSet, loopsyms::Vector{Symbol})
     end
     ld
 end
-# loopdeps_uint(ls::LoopSet, op::Operation) = (@show op; shifted_loopset(ls, loopdependencies(op)))
 loopdeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, loopdependencies(op))
 reduceddeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, reduceddependencies(op))
 childdeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, reducedchildren(op))
@@ -356,7 +355,6 @@ end
 # 2) decide whether to gesp that loopstart inside `add_grouped_strided_pointer`
 function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet)
   allarrayrefs, name_to_array_map, unique_to_name_and_op_map = uniquearrayrefs_csesummary(ls)
-  # @show allarrayrefs
   gsp = Expr(:call, lv(:grouped_strided_pointer))
   tgarrays = Expr(:tuple)
   # refs_to_gesp = ArrayReferenceMeta[]
@@ -371,7 +369,6 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet)
   #   ar = allarrayrefs[j]
   #   gespinds = cse_constant_offsets!(ls, allarrayrefs, j, array_refs_with_same_name, arrayref_to_name_op_collection)
   # end
-  # @show refs_aliasing_syms
   for (j,ref) ∈ enumerate(refs_aliasing_syms)
     vpref = vptr(ref)
     duplicate = false
@@ -381,7 +378,6 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet)
         break
       end
     end
-    # @show duplicate
     duplicate && continue
     duplicate_map[j] = (i += 1)
     found = false
diff --git a/src/modeling/determinestrategy.jl b/src/modeling/determinestrategy.jl
@@ -44,7 +44,6 @@ function unitstride(ls::LoopSet, op::Operation, s::Symbol)
     li = op.ref.loopedindex
     # The first index is allowed to be indexed by `s`
     fi = first(inds)
-    # @show (fi === DISCONTIGUOUS), (fi === CONSTANTZEROINDEX), (first(getstrides(op)) ≠ 1), unitstep(getloop(ls,s))
     if ((fi === DISCONTIGUOUS) | (fi === CONSTANTZEROINDEX)) || (first(getstrides(op)) ≠ 1) || !unitstep(getloop(ls,s))
         return false
     # elseif !first(li)
@@ -77,27 +76,25 @@ function cost(ls::LoopSet, op::Operation, (u₁,u₂)::Tuple{Symbol,Symbol}, vlo
         end
     elseif iscompute(op) &&
         Base.sym_in(instruction(op).instr, (:(+), :(-), :add_fast, :sub_fast)) &&
-        all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
+        all(opp -> (isloopvalue(opp)), parents(op))
+        # all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
         return 0.0, 0, 0.0
     end
     opisvectorized = isvectorized(op)
     srt, sl, srp = opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr)
     if accesses_memory(op)
         # either vbroadcast/reductionstore, vmov(a/u)pd, or gather/scatter
         if opisvectorized
-            # @show unitstride(ls,op,vloopsym), srt,sl,srp
             if !unitstride(ls, op, vloopsym)# || !isdense(op) # need gather/scatter
                 indices = getindices(op)
                 contigind = first(indices)
-                # @show rejectinterleave(op) op
                 shifter = max(2,Wshift)
                 if rejectinterleave(op)
                     offset = 0.0 # gather/scatter, alignment doesn't matter
                 else
                     shifter -= 1
                     offset = 0.5reg_size(ls) / cache_lnsze(ls)
                 end
-                # @show shifter,offset, Wshift
                 if shifter > 1 &&
                     (!rejectcurly(op) && (((contigind === CONSTANTZEROINDEX) && ((length(indices) > 1) && (indices[2] === u₁) || (indices[2] === u₂))) ||
                     ((u₁ === contigind) | (u₂ === contigind))))
@@ -118,7 +115,6 @@ function cost(ls::LoopSet, op::Operation, (u₁,u₂)::Tuple{Symbol,Symbol}, vlo
                 #       this feature is common to all of them.
                 srt += 0.5reg_size(ls) / cache_lnsze(ls)
             end
-            # @show srt,sl,srp
         elseif isstore(op) # broadcast or reductionstore; if store we want to penalize reduction
             srt *= 3
             sl *= 3
@@ -184,7 +180,7 @@ function evaluate_cost_unroll(
             included_vars[id] && continue
             # it must also be a subset of defined symbols
             loopdependencies(op) ⊆ nested_loop_syms || continue
-            # hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf
+          # hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf
             rd = reduceddependencies(op)
             hasintersection(rd, @view(nested_loop_syms[1:end-length(rd)])) && return Inf
             if isstore(op) #TODO: DRY (this is repeated in evaluate_cost_tile)
@@ -194,7 +190,6 @@ function evaluate_cost_unroll(
                 end
             end
             included_vars[id] = true
-            # @show op, cost(ls, op, vloopsym, Wshift, size_T)
             # TODO: use actual unrolls here?
             c = first(cost(ls, op, (Symbol(""),Symbol("")), vloopsym, Wshift, size_T))
             total_cost += iter * c
@@ -213,16 +208,13 @@ function depchain_cost!(
     for opp ∈ parents(op)
         skip[identifier(opp)] && continue
         rt, sl = depchain_cost!(ls, skip, opp, unrolled, vloopsym, Wshift, size_T, rt, sl)
-        # @show rt,sl, opp
     end
     # Basically assuming memory and compute don't conflict, but everything else does
     # Ie, ignoring the fact that integer and floating point operations likely don't either
     if iscompute(op)
         rtᵢ, slᵢ = cost(ls, op, (unrolled,Symbol("")), vloopsym, Wshift, size_T)
-        # @show rtᵢ, slᵢ, op
         rt += rtᵢ; sl += slᵢ
     end
-    # @show rt, sl
     rt, sl
 end
 function parentsnotreduction(op::Operation)
@@ -280,7 +272,6 @@ function unroll_no_reductions(ls, order, vloopsym)
         max(1, min(4, round(Int, 2compute_rt / load_rt)))
     end
     # u = min(u, max(1, (reg_count(ls) ÷ max(1,round(Int,rp)))))
-    # @show u
     # commented out here is to decide to align loops
     # if memory_rt > compute_rt && isone(u) && (length(order) > 1) && (last(order) === vloopsym) && length(getloop(ls, last(order))) > 8W
     #     ls.align_loops[] = findfirst(operations(ls)) do op
@@ -333,7 +324,6 @@ function determine_unroll_factor(
         load_recip_throughput,
         store_recip_throughput
     )
-    # @show recip_throughput, latency
     recip_throughput, latency
 end
 function count_reductions(ls::LoopSet)
@@ -393,9 +383,9 @@ function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vloopsym::S
         end
     end
     # min(8, roundpow2(max(1, round(Int, latency / (rt * num_reductions) ) ))), best_unrolled
-    UF = min(8, VectorizationBase.nextpow2(max(1, round(Int, latency / (rt * num_reductions) ) )))
+    UF = VectorizationBase.nextpow2(round(Int, clamp(latency / (rt * num_reductions), 1.0, 8.0)))
     if UF == 1 && num_reductions > 1
-        UF = min(8, VectorizationBase.nextpow2(max(1, round(Int, latency / (rt * cld(num_reductions, 2)) ) )))
+        UF = VectorizationBase.nextpow2(round(Int, clamp(latency / (rt * cld(num_reductions, 2)), 1.0, 8.0)))
     end
     if best_unrolled === vloopsym
         UF = demote_unroll_factor(ls, UF, vloopsym)
@@ -406,8 +396,6 @@ end
 function unroll_cost(X, u₁, u₂, u₁L, u₂L)
     u₂factor = (num_iterations(u₂L, u₂)/u₂L)
     u₁factor = (num_iterations(u₁L, u₁)/u₁L)
-    # @show num_iterations(u₂L, u₂)/u₂L, u₂, u₂L
-    # @show num_iterations(u₁L, u₁)/u₁L, u₁, u₁L
     # X[1]*u₂factor*u₁factor + X[4] + X[2] * u₂factor + X[3] * u₁factor
     X[1] + X[2] * u₂factor + X[3] * u₁factor + X[4] * u₁factor * u₂factor
 end
@@ -433,8 +421,6 @@ function solve_unroll_iter(X, R, u₁L, u₂L, u₁range, u₂range)
         for u₂temp ∈ u₂range
             RR ≥ u₁temp*u₂temp*R₁ + u₁temp*R₂ + u₂temp*R₃ || continue
             tempcost = unroll_cost(X, u₁temp, u₂temp, u₁L, u₂L)
-            # @show u₁temp, u₂temp, tempcost
-            # @show u₁temp*u₂temp*R₁ + u₁temp*R₂ + u₂temp*R₃
             if tempcost ≤ bestcost
                 bestcost = tempcost
                 u₁best, u₂best = u₁temp, u₂temp
@@ -455,10 +441,8 @@ function solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step::Int, u₂step::Int,
     c = X₃*RR^2
     discriminant = b^2 - 4a*c
     discriminant < 0 && return -1,-1,Inf
-    # @show R₁, R₂, R₃, R₄
     u₁float = max((sqrt(discriminant) + b) / (-2a), float(u₁step)) # must be at least 1
     u₂float = (RR - u₁float*R₂)/(u₁float*R₁)
-    # @show u₁float, u₂float
     if !(isfinite(u₂float) & isfinite(u₁float)) # brute force
         u₁low = u₂low = 1
         u₁high = iszero(X₂) ? 2 : (atleast32registers ? 8 : 6)
@@ -611,7 +595,6 @@ function solve_unroll(
     else
         u₂Lf = Float64(u₂L)
     end
-    # @show u₁Lf, u₂Lf, u₁L, length(u₁loop)
     u₁, u₂, cost = solve_unroll(cost_vec, reg_pressure, maxu₁, maxu₂, u₁Lf, u₂Lf, u₁step, u₂step, atleast32registers)
     # heuristic to more evenly divide small numbers of iterations
     if isstaticloop(u₂loop)
@@ -635,7 +618,6 @@ function loopdepindices(ls::LoopSet, op::Operation)
     isdiscontig = first(loopdeps) === DISCONTIGUOUS
     # isdiscontig = isdiscontiguous(op.ref)
     loopedindex = op.ref.loopedindex
-    # @show loopdeps
     if !isdiscontig && all(loopedindex) && !(any(==(CONSTANTZEROINDEX), loopdeps))
         return loopdeps
     end
@@ -654,7 +636,6 @@ function loopdepindices(ls::LoopSet, op::Operation)
 end
 function stride_penalty(ls::LoopSet, op::Operation, order::Vector{Symbol}, loopfreqs)
     loopdeps = loopdepindices(ls, op)
-    # @show op loopdeps
     opstrides = Vector{Int}(undef, length(loopdeps))
     # very minor stride assumption here, because we don't really want to base optimization decisions on it...
     opstrides[1] = 1.0 + (first(loopdependencies(op.ref)) === DISCONTIGUOUS) + (first(loopdependencies(op.ref)) === CONSTANTZEROINDEX)
@@ -815,10 +796,8 @@ function load_elimination_cost_factor!(
     cost_vec, reg_pressure, choose_to_inline, ls::LoopSet, op::Operation, iters, unrollsyms::UnrollSymbols, Wshift, size_T
 )
     @unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms
-    # @show isoptranslation(ls, op, unrollsyms)
     if !iszero(first(isoptranslation(ls, op, unrollsyms)))
         rt, lat, rp = cost(ls, op, (u₁loopsym, u₂loopsym), vloopsym, Wshift, size_T)
-        # @show rt
         rto = rt
         rt *= iters
             # rt *= factor1; rp *= factor2;
@@ -1086,7 +1065,6 @@ function evaluate_cost_tile!(
         if isstore(op) & (!u₁reducesrt) & (!u₂reducesrt)
             irreducible_storecosts += rt
         end
-        # iiter = convert(Int, iters[id]); @show u₁reducesrt, u₂reducesrt, op, rt, rto, rp, iiter
         update_cost_vec!(cost_vec, rt, u₁reducesrt, u₂reducesrt)
         update_reg_pres!(reg_pressure, rp, u₁reducesrp, u₂reducesrp)
         # update_costs!(reg_pressure, rp, u₁reducesrp, u₂reducesrp)
@@ -1104,7 +1082,6 @@ function evaluate_cost_tile!(
     else
         0
     end
-    # @show (irreducible_storecosts / sum(cost_vec))
     if (irreducible_storecosts / sum(cost_vec) ≥ 0.5) && !any(op -> loadintostore(ls, op), operations(ls))
         u₁, u₂ = if visbit
             vecsforbyte = 8 ÷ ls.vector_width
diff --git a/src/modeling/graphs.jl b/src/modeling/graphs.jl
@@ -204,7 +204,6 @@ staticmulincr(ptr, incr) = Expr(:call, lv(:staticmul), Expr(:call, :eltype, ptr)
 @inline cmpend(i::Int, r::CloseOpen) = i < getfield(r,:upper)
 @inline cmpend(i::Int, r::AbstractUnitRange) = i ≤ last(r)
 @inline cmpend(i::Int, r::AbstractRange) = i ≤ last(r)
-# @inline cmpend(i::Int, r::AbstractRange) = @show i last(r) i ≤ last(r)
 # @inline cmpend(i::Int, r::AbstractRange) = i ≤ vsub_fast(last(r), step(r))
 
 @inline vcmpend(i::Int, r::CloseOpen, ::StaticInt{W}) where {W} = i ≤ vsub_fast(getfield(r,:upper), W)
@@ -213,7 +212,6 @@ staticmulincr(ptr, incr) = Expr(:call, lv(:staticmul), Expr(:call, :eltype, ptr)
 # i += 4*3 # i = 12
 @inline vcmpend(i::Int, r::AbstractRange, ::StaticInt{W}) where {W} = i ≤ vsub_fast(last(r), vsub_fast(W*step(r), 1))
 # @inline vcmpend(i::Int, r::AbstractRange, ::StaticInt{W}) where {W} = i ≤ vsub_fast(last(r), W*step(r))
-# @inline vcmpend(i::Int, r::AbstractRange, ::StaticInt{W}) where {W} = @show i m = vsub_fast(last(r), W*step(r)) i ≤ m
 # @inline vcmpend(i::Int, r::AbstractRange, ::StaticInt{W}) where {W} = i ≤ vsub_fast(last(r), W)
 
 function staticloopexpr(loop::Loop)
@@ -664,20 +662,12 @@ end
 names(ls::LoopSet) = ls.loop_order.loopnames
 reversenames(ls::LoopSet) = ls.loop_order.bestorder
 function getloopid_or_nothing(ls::LoopSet, s::Symbol)
-    # @show ls.loopsymbols, s
     for (loopnum,sym) ∈ enumerate(ls.loopsymbols)
         s === sym && return loopnum
     end
 end
 
 getloopid(ls::LoopSet, s::Symbol) = getloopid_or_nothing(ls, s)::Int
-# function getloopid(ls::LoopSet, s::Symbol)::Int
-#     @show ls.loops
-#     id = getloopid_or_nothing(ls, s)
-#     @show id
-#     id
-# end
-# getloop(ls::LoopSet, i::Integer) = getloop(ls, names(ls)[i])
 getloop(ls::LoopSet, i::Integer) = ls.loops[ls.loopordermap[i]] # takes nest level after reordering
 getloop_from_id(ls::LoopSet, i::Integer) = ls.loops[i] # takes w/ respect to original loop order.
 getloop(ls::LoopSet, s::Symbol) = getloop_from_id(ls, getloopid(ls, s))
@@ -1270,7 +1260,6 @@ function fill_offset_memop_collection!(ls::LoopSet)
             else
                 isstore(opp) || continue
             end
-            # @show op opp
             oppref = opp.ref.ref
             sameref(opref, oppref) || continue
             if collectionsize == 0
@@ -1306,13 +1295,11 @@ function fill_offset_memop_collection!(ls::LoopSet)
             for j ∈ 1:num_unroll_collections
                 collectionⱼ = unroll_collections[j]
                 # giet id (`first`) of first item in collection to get base offsets for comparison
-                # @show op, opid ops[opidc[first(first(collectionⱼ))], first(first(collectionⱼ))
                 if view(getoffsets(ops[opidc[first(first(collectionⱼ))]]), r) == v
                     found_match = true
                     push!(collectionⱼ, (i, o))
                 end
             end
-            # @show opid, found_match
             if !found_match
                 num_unroll_collections += 1 # the `i` points to position within `opidc`
                 unroll_collections[num_unroll_collections] = [(i,o)]
diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl
@@ -672,7 +672,7 @@ Execute an `@avx` block. The block's code is represented via the arguments:
 @aggressive_constprop @generated function _avx_!(
     ::Val{var"#UNROLL#"}, ::Val{var"#OPS#"}, ::Val{var"#ARF#"}, ::Val{var"#AM#"}, ::Val{var"#LPSYM#"}, var"#lv#tuple#args#"::Tuple{var"#LB#",var"#V#"}
 ) where {var"#UNROLL#", var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#", var"#LB#", var"#V#"}
-    # 1 + 1 # Irrelevant line you can comment out/in to force recompilation...
+    1 + 1 # Irrelevant line you can comment out/in to force recompilation...
     ls = _avx_loopset(var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#", var"#LB#".parameters, var"#V#".parameters, var"#UNROLL#")
     # return @show avx_body(ls, var"#UNROLL#")
     if last(var"#UNROLL#") > 1
diff --git a/test/copy.jl b/test/copy.jl
@@ -155,7 +155,7 @@ using LoopVectorization, OffsetArrays, Test
         @test x == q2
         fill!(q2, -999999); @avx q2 .= x;
         @test x == q2
-        @test all(iszero, issue_256!(x))
+        @test all(iszero, issue_256!(reshape(x,(length(x),1))))
 
         B = rand(R, 79, 83);
         A1 = zeros(T, 79, 85);