Instead of the hack of increasing W to 8 when any array has bits, instead when a vectorized axis loads/stores bits

chriselrod · chriselrod · commit 49d5ae964c69 · 2021-03-13T18:05:11.000-06:00
diff --git a/Project.toml b/Project.toml
@@ -28,7 +28,7 @@ SLEEFPirates = "0.6.12"
 Static = "0.2"
 ThreadingUtilities = "0.3"
 UnPack = "1"
-VectorizationBase = "0.19.5"
+VectorizationBase = "0.19.6"
 julia = "1.5"
 
 [extras]
diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl
@@ -399,7 +399,7 @@ function check_device(x)
     @info """`LoopVectorization.check_args` returned `false`, because `ArrayInterface.device(::$(typeof(x))) == $x`
         `LoopVectorization` normally requires `ArrayInterface.CPUPointer` (exceptions include ranges, `BitVector`s, and
         `BitArray`s whose number of rows is a multiple of 8). Therefore compiling a probably slow `@inbounds @fastmath` fallback loop.""" maxlog=1
-    false    
+    false
 end
 
 function check_args_call(ls::LoopSet)
diff --git a/src/modeling/determinestrategy.jl b/src/modeling/determinestrategy.jl
@@ -125,12 +125,6 @@ end
 function biggest_type_size(ls::LoopSet)
     maximum(elsize, operations(ls))
 end
-# function VectorizationBase.pick_vector_width(ls::LoopSet, u::Symbol)
-#     VectorizationBase.pick_vector_width(length(ls, u), biggest_type_size(ls))
-# end
-# function VectorizationBase.pick_vector_width_shift(ls::LoopSet, u::Symbol)
-#     VectorizationBase.pick_vector_width_shift(length(ls, u), biggest_type_size(ls))
-# end
 function hasintersection(a, b)
     for aᵢ ∈ a, bᵢ ∈ b
         aᵢ === bᵢ && return true
@@ -242,7 +236,6 @@ end
 function unroll_no_reductions(ls, order, vloopsym)
     size_T = biggest_type_size(ls)
     W, Wshift = lsvecwidthshift(ls, vloopsym, size_T)
-    # W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vloopsym), size_T)::Tuple{Int,Int}
 
     compute_rt = load_rt = store_rt = 0.0
     unrolled = last(order)
@@ -361,11 +354,25 @@ function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vloopsym::S
     num_reductions = count_reductions(ls)
     # The strategy is to use an unroll factor of 1, unless there appears to be loop carried dependencies (ie, num_reductions > 0)
     # The assumption here is that unrolling provides no real benefit, unless it is needed to enable OOO execution by breaking up these dependency chains
-    if iszero(num_reductions)
-        # if only 1 loop, no need to unroll
-        # if more than 1 loop, there is some cost. Picking 2 here as a heuristic.
-        return unroll_no_reductions(ls, order, vloopsym)
+    loopindexesbit = ls.loopindexesbit
+    if iszero(length(loopindexesbit)) || ((!loopindexesbit[getloopid(ls, vloopsym)]))
+        if iszero(num_reductions)
+            return unroll_no_reductions(ls, order, vloopsym)
+        else
+            return determine_unroll_factor(ls, order, vloopsym, num_reductions)
+        end
+    elseif iszero(num_reductions)
+        return 8 ÷ ls.vector_width[], vloopsym
+    else
+        rttemp, ltemp = determine_unroll_factor(ls, order, vloopsym, vloopsym)
+        UF = min(8, VectorizationBase.nextpow2(max(1, round(Int, ltemp / (rttemp * num_reductions) ) )))
+        UFfactor = 8 ÷ ls.vector_width[]
+        cld(UF, UFfactor)*UFfactor, vloopsym
     end
+end
+# function scale_unrolled()
+# end
+function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vloopsym::Symbol, num_reductions::Int)
     innermost_loop = last(order)
     rt = Inf; rtcomp = Inf; latency = Inf; best_unrolled = Symbol("")
     for unrolled ∈ order
@@ -533,12 +540,17 @@ function solve_unroll(
     W::Int, vloopsym::Symbol, rounduᵢ::Int
 )
     (u₁step, u₂step) = if rounduᵢ == 1 # max is to safeguard against some weird arch I've never heard of.
-        (max(1,cache_lnsze(ls) ÷ reg_size(ls)), 1)
+        (max(1, cache_lnsze(ls) ÷ reg_size(ls)), 1)
     elseif rounduᵢ == 2
         (1, max(1,cache_lnsze(ls) ÷ reg_size(ls)))
+    elseif rounduᵢ == -1
+        (8 ÷ ls.vector_width[], 1)
+    elseif rounduᵢ == -2
+        (1, 8 ÷ ls.vector_width[])
     else
         (1, 1)
     end
+    # @show u₁step, u₂step
     u₁loop = getloop(ls, u₁loopsym)
     u₂loop = getloop(ls, u₂loopsym)
     solve_unroll(
@@ -921,7 +933,7 @@ end
 # But optimal order within tile must still be determined
 # as well as size of the tiles.
 function evaluate_cost_tile(
-    ls::LoopSet, order::Vector{Symbol}, unrollsyms::UnrollSymbols
+    ls::LoopSet, order::Vector{Symbol}, unrollsyms::UnrollSymbols, anyisbit::Bool
 )
     N = length(order)
     @assert N ≥ 2 "Cannot tile merely $N loops!"
@@ -940,7 +952,6 @@ function evaluate_cost_tile(
     # Need to check if fusion is possible
     size_T = biggest_type_size(ls)
     W, Wshift = lsvecwidthshift(ls, vloopsym, size_T)
-    # W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vloopsym), size_T)::Tuple{Int,Int}
     # costs =
     # cost_mat[1] / ( unrolled * u₂loopsym)
     # cost_mat[2] / ( u₂loopsym)
@@ -1019,10 +1030,8 @@ function evaluate_cost_tile(
         #elseif isconstant(op)
         end
         rt, lat, rp = cost(ls, op, (u₁loopsym, u₂loopsym), vloopsym, Wshift, size_T)
-        if isload(op)
-            if !prefetch_good_idea
-                prefetch_good_idea = prefetchisagoodidea(ls, op, UnrollArgs(ls, 4, unrollsyms, 4, 0)) ≠ 0
-            end
+        if isload(op) & (!prefetch_good_idea)
+            prefetch_good_idea = prefetchisagoodidea(ls, op, UnrollArgs(ls, 4, unrollsyms, 4, 0)) ≠ 0
         end
         # rp = (opisininnerloop && !(loadintostore(ls, op))) ? rp : zero(rp) # we only care about register pressure within the inner most loop
         rp = opisininnerloop ? rp : zero(rp) # we only care about register pressure within the inner most loop
@@ -1041,10 +1050,22 @@ function evaluate_cost_tile(
     # reg_pres[4] == remaining_registers
     costpenalty = ((reg_pressure[1] + reg_pressure[2] + reg_pressure[3]) > reg_pressure[4]) ? 2 : 1
     u₁v = vloopsym === u₁loopsym; u₂v = vloopsym === u₂loopsym
-    round_uᵢ = prefetch_good_idea ? (u₁v ? 1 : (u₂v ? 2 : 0)) : 0
+    visbit = anyisbit && ls.loopindexesbit[getloopid(ls,vloopsym)]
+    round_uᵢ = if visbit
+        (u₁v ? -1 : (u₂v ? -2 : 0))
+    elseif prefetch_good_idea
+        (u₁v ? 1 : (u₂v ? 2 : 0))
+    else
+        0
+    end
     # @show (irreducible_storecosts / sum(cost_vec))
     if (irreducible_storecosts / sum(cost_vec) ≥ 0.5) && !any(op -> loadintostore(ls, op), operations(ls))
-        u₁, u₂ = (1, 1)
+        u₁, u₂ = if visbit
+            vecsforbyte = 8 ÷ ls.vector_width[]
+            u₁v ? (vecsforbyte,1) : (1,vecsforbyte)
+        else
+            (1, 1)
+        end
         ucost = unroll_cost(cost_vec, 1, 1, length(getloop(ls, u₁loopsym)), length(getloop(ls, u₂loopsym)))
     else
         u₁, u₂, ucost = solve_unroll(ls, u₁loopsym, u₂loopsym, cost_vec, reg_pressure, W, vloopsym, round_uᵢ)
@@ -1198,6 +1219,7 @@ function choose_tile(ls::LoopSet)
     best_order = copyto!(ls.loop_order.bestorder, lo.syms)
     bestu₁ = bestu₂ = best_vec = first(best_order) # filler
     u₁ = u₂ = 0; lowest_cost = Inf; shouldinline = false
+    anyisbit = any(ls.loopindexesbit)
     for newu₂ ∈ lo.syms
         reject_reorder(ls, newu₂) && continue
         for newu₁ ∈ lo.syms#@view(new_order[nt+1:end])
@@ -1207,7 +1229,11 @@ function choose_tile(ls::LoopSet)
             while true
                 for new_vec ∈ new_order # view to skip first
                     reject_reorder(ls, new_vec) && continue
-                    u₁temp, u₂temp, cost_temp, shouldinline_temp = evaluate_cost_tile(ls, new_order, UnrollSymbols(newu₁, newu₂, new_vec))
+                    if anyisbit && ls.loopindexesbit[getloopid(ls,new_vec)]
+                        # ((new_vec === newu₁) || (new_vec === newu₂)) || continue
+                        (new_vec === newu₁) || continue
+                    end
+                    u₁temp, u₂temp, cost_temp, shouldinline_temp = evaluate_cost_tile(ls, new_order, UnrollSymbols(newu₁, newu₂, new_vec), anyisbit)
                     # if cost_temp < lowest_cost # leads to 4 vmovapds
                     if cost_temp ≤ lowest_cost # lead to 2 vmovapds
                         lowest_cost = cost_temp
diff --git a/src/modeling/graphs.jl b/src/modeling/graphs.jl
@@ -441,6 +441,7 @@ struct LoopSet
     equalarraydims::Vector{Tuple{Vector{Symbol},Vector{Int}}}
     omop::OffsetLoadCollection
     loopordermap::Vector{Int}
+    loopindexesbit::Vector{Bool}
     mod::Symbol
 end
 
@@ -562,7 +563,7 @@ function LoopSet(mod::Symbol)
         Ref(-1), # Ureduct
         Tuple{Vector{Symbol},Vector{Int}}[],
         OffsetLoadCollection(),
-        Int[],
+        Int[], Bool[],
         mod
     )
 end
diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl
@@ -181,6 +181,14 @@ function add_mref!(
     end
     add_mref_ptr!(sptrs, ls, ar, Tsym, C, B, sp, name)
 end
+function loop_indexes_bit!(ls::LoopSet, ar::ArrayReferenceMeta)
+    li = ar.loopedindex;
+    ind = first(getindices(ar))
+    ind === DISCONTIGUOUS && return
+    first(li) || throw(LoopError("The contiguous index of a `BitArray` shouldn't be a complex function.", )ind)
+    ls.loopindexesbit[getloopid(ls,ind)] = true
+    nothing
+end
 function add_mref_ptr!(
     sptrs::Expr, ls::LoopSet, ar::ArrayReferenceMeta, Tsym::Symbol,
     C::Int, B::Int, sp::NTuple{N,Int}, name::Symbol
@@ -190,6 +198,8 @@ function add_mref_ptr!(
     column_major = ntuple(identity, N)
     li = ar.loopedindex;
     if sp === column_major || isone(length(li))
+        # don't set `bit` to true if our vector width is ≥ 8
+        ((Tsym === :Bit) && (ls.vector_width[] < 8)) && loop_indexes_bit!(ls, ar)
         return extract_gsp!(sptrs, name)
     end
     permute_mref!(ar, C, sp)
@@ -206,6 +216,7 @@ function add_mref_ptr!(
         push!(strd_tup.args, Expr(:call, gf, strides, p, false))
         push!(offsets_tup.args, Expr(:call, gf, offsets, p, false))
     end
+    #TODO: fix for `Tsym === Bit`.
     sptype = Expr(:curly, lv(:StridedPointer), Tsym, N, (C == -1 ? -1 : 1), B, column_major)
     sptr = Expr(:call, sptype, Expr(:call, :pointer, tmpsp), strd_tup, offsets_tup)
     pushpreamble!(ls, Expr(:(=), name, sptr))
@@ -527,11 +538,10 @@ function sizeofeltypes(v)::Int
     # sizeof(T)
 end
 
-function avx_loopset(
-    instr::Vector{Instruction}, ops::Vector{OperationStruct}, arf::Vector{ArrayRefStruct},
+function avx_loopset!(
+    ls::LoopSet, instr::Vector{Instruction}, ops::Vector{OperationStruct}, arf::Vector{ArrayRefStruct},
     AM::Vector{Any}, LPSYM::Vector{Any}, LB::Core.SimpleVector, vargs::Core.SimpleVector
 )
-    ls = LoopSet(:LoopVectorization)
     # TODO: check outer reduction types instead
     elementbytes = if length(vargs[1].parameters) > 0
         sizeofeltypes(vargs[1].parameters[1].parameters)
@@ -546,6 +556,7 @@ function avx_loopset(
     nopsv = NOpsType[calcnops(ls, op) for op in ops]
     expandedv = [isexpanded(ls, ops, nopsv, i) for i ∈ eachindex(ops)]
 
+    resize!(ls.loopindexesbit, length(ls.loops)); ls.loopindexesbit .= false;
     mrefs = create_mrefs!(ls, arf, arraysymbolinds, opsymbols, nopsv, expandedv, vargs[1])
     for mref ∈ mrefs
         push!(ls.includedactualarrays, vptr(mref))
@@ -588,15 +599,17 @@ function _avx_loopset(
     nops = length(OPSsv) ÷ 3
     instr = Instruction[Instruction(OPSsv[3i+1], OPSsv[3i+2]) for i ∈ 0:nops-1]
     ops = OperationStruct[ OPSsv[3i] for i ∈ 1:nops ]
-    ls = avx_loopset(
-        instr, ops,
+    ls = LoopSet(:LoopVectorization)
+    inline, u₁, u₂, W, rs, rc, cls, l1, l2, l3, nt = UNROLL
+    set_hw!(ls, rs, rc, cls, l1, l2, l3); ls.vector_width[] = W
+    avx_loopset!(
+        ls, instr, ops,
         ArrayRefStruct[ARFsv...],
         tovector(AMsv), tovector(LPSYMsv), LBsv, vargs
     )::LoopSet
-    inline, u₁, u₂, W, rs, rc, cls, l1, l2, l3, nt = UNROLL
-    set_hw!(ls, rs, rc, cls, l1, l2, l3); ls.vector_width[] = W
     ls
 end
+
 """
     _avx_!(unroll, ops, arf, am, lpsym, lb, vargs...)
 
@@ -619,7 +632,7 @@ Execute an `@avx` block. The block's code is represented via the arguments:
 @generated function _avx_!(
     ::Val{UNROLL}, ::Val{OPS}, ::Val{ARF}, ::Val{AM}, ::Val{LPSYM}, var"#lv#tuple#args#"::Tuple{LB,V}
 ) where {UNROLL, OPS, ARF, AM, LPSYM, LB, V}
-    # 1 + 1 # Irrelevant line you can comment out/in to force recompilation...
+    1 + 1 # Irrelevant line you can comment out/in to force recompilation...
     ls = _avx_loopset(OPS, ARF, AM, LPSYM, LB.parameters, V.parameters, UNROLL)
     # return @show avx_body(ls, UNROLL)
     if last(UNROLL) > 1
diff --git a/test/gemm.jl b/test/gemm.jl
@@ -71,7 +71,7 @@
         end
     end
     function AmulBavx1!(C, A, B)
-        @avx unroll=(1,2) for m ∈ 1:size(A,1), n ∈ axes(B,2)
+        @avx for m ∈ 1:size(A,1), n ∈ axes(B,2)
             Cₘₙ = zero(eltype(C))
             for k ∈ 1:size(A,2)
                 Cₘₙ += A[m,k] * B[k,n]
@@ -624,7 +624,8 @@
         # @test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :m, :n, :m, 1, 8)
         @test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :k, :n, :m, 1, 8)
     elseif LoopVectorization.register_count() == 16
-        @test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :m, :n, :m, 2, 4)
+        # @test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :m, :n, :m, 2, 4)
+        @test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :n, :m, :n, 2, 4)
     elseif LoopVectorization.register_count() == 8
         @test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :m, :n, :m, 1, 4)
     end
diff --git a/test/gemv.jl b/test/gemv.jl
@@ -259,8 +259,10 @@ using Test
             y1 = view(y1full, M .+ (1:M));
             y2 = view(y2full, M .+ (1:M));
             Abit = A .> 0.5;
-            fill!(y2, -9999); mygemv_avx!(y2, Abit, x);
-            @test y2 ≈ Abit * x
+            if LoopVectorization.pick_vector_width(T) ≥ 8
+                fill!(y2, -9999); mygemv_avx!(y2, Abit, x);
+                @test y2 ≈ Abit * x
+            end
             fill!(y2, -9999); mygemvavx!(y2, Abit, x);
             @test y2 ≈ Abit * x
             xbit = x .> 0.5;
diff --git a/test/miscellaneous.jl b/test/miscellaneous.jl
@@ -67,7 +67,8 @@ using Test
                 end)
     lssubcol = LoopVectorization.loopset(subcolq);
     # @test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :i, Symbol("##undefined##"), :j, 1, -1)
-    @test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :j, :i, :j, 1, 8)
+    # @test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :j, :i, :j, 1, 8)
+    @test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :j, :i, :j, 1, 6)
 
 
     # if LoopVectorization.register_count() != 8