Further improvements to stride-penalties. Now just the dot product of roughly estimated array strides and loop repetitions (uncorrected by unrolling factors; should probably make those adjustments?)
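The penalty described in the commit message might be sketched as follows. This is a hypothetical plain-C illustration (the function name `stride_penalty` and the sample numbers are invented for this example, not taken from the actual implementation):

```c
#include <stddef.h>

/* Hypothetical sketch of the stride penalty described above: the cost
 * is just the dot product of the roughly estimated array strides and
 * the loop repetition counts, with no correction for unrolling factors. */
double stride_penalty(const double *strides, const double *reps, size_t n) {
    double cost = 0.0;
    for (size_t i = 0; i < n; ++i)
        cost += strides[i] * reps[i];  /* penalty accumulates per array */
    return cost;
}
```

For example, one unit-stride access repeated 128 times plus one stride-8 access repeated 16 times would score `1*128 + 8*16 = 256`.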
What we just described is the core of the approach used by all of these compilers. The variation in results is explained mostly by how they handle vectors whose lengths are not an integer multiple of `W`. I ran these benchmarks on a computer with AVX512, so `W = 8`. LLVM, the backend compiler of both Julia and Clang, shows rapid performance degradation as `N % 4W` increases, where `N` is the length of the vectors.
This is because, to handle the remainder, it uses a scalar loop that runs as written: multiply and add single elements, one after the other.
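The shape of that strategy can be sketched in plain C (a stand-in for the SIMD code LLVM actually emits; the function name and the unroll structure here are illustrative, not the literal codegen):

```c
#include <stddef.h>

enum { W = 8 }; /* AVX512: 8 doubles per vector */

/* Vectorized main loop with W independent partial sums, followed by
 * the scalar remainder loop described above: the last N % W elements
 * are multiplied and added one after the other. */
double dot(const double *a, const double *b, size_t N) {
    double acc[W] = {0};
    size_t i = 0;
    for (; i + W <= N; i += W)        /* "vector" body, W lanes at a time */
        for (size_t l = 0; l < W; ++l)
            acc[l] += a[i + l] * b[i + l];
    double s = 0.0;
    for (size_t l = 0; l < W; ++l)    /* horizontal reduction */
        s += acc[l];
    for (; i < N; ++i)                /* scalar remainder: one element at a time */
        s += a[i] * b[i];
    return s;
}
```

The cost of the scalar tail grows with `N % W` (and with `N % 4W` once the main loop is also unrolled by 4), which is the degradation pattern seen in the benchmarks.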
Initially, GCC (gfortran) stumbled in throughput because, by default, it does not use separate accumulation vectors (except on Power), even with `-funroll-loops`.
I compiled with the flags `-fvariable-expansion-in-unroller --param max-variable-expansions-in-unroller=4` to allow for 4 accumulation vectors, yielding good performance.
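What variable expansion buys is the breaking of the serial dependency chain through the single accumulator. A plain-C sketch of the transformed loop (the function name `dot4` is invented; the real transformation happens on gfortran's unrolled vector loop, not scalar code):

```c
#include <stddef.h>

/* Sketch of what -fvariable-expansion-in-unroller enables: the unrolled
 * loop carries 4 separate accumulators instead of one, so the adds can
 * overlap in flight rather than each waiting on the previous one. */
double dot4(const double *a, const double *b, size_t N) {
    double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    size_t i = 0;
    for (; i + 4 <= N; i += 4) {
        s0 += a[i]     * b[i];
        s1 += a[i + 1] * b[i + 1];
        s2 += a[i + 2] * b[i + 2];
        s3 += a[i + 3] * b[i + 3];
    }
    for (; i < N; ++i)              /* scalar remainder */
        s0 += a[i] * b[i];
    return (s0 + s1) + (s2 + s3);   /* combine partial sums at the end */
}
```

With one accumulator, throughput is limited by the latency of the floating-point add; four independent chains let the loop approach the add/FMA throughput instead.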
The Intel compilers have a secondary vectorized loop without any additional unrolling that masks off excess lanes beyond `N` (for when `N` isn't an integer multiple of `W`).
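That masked-tail strategy can be sketched in plain C (a scalar stand-in for the masked vector loads/FMAs the Intel compilers actually generate; the function name is invented for this example):

```c
#include <stddef.h>

enum { VW = 8 }; /* vector width */

/* Sketch of the masked-remainder strategy described above: every
 * iteration is processed at full vector width, and lanes whose index
 * reaches past N are masked off (contribute 0) instead of falling
 * back to a scalar loop. */
double dot_masked(const double *a, const double *b, size_t N) {
    double s = 0.0;
    for (size_t i = 0; i < N; i += VW) {
        for (size_t l = 0; l < VW; ++l) {
            size_t j = i + l;
            if (j < N)              /* mask: excess lanes beyond N do nothing */
                s += a[j] * b[j];
        }
    }
    return s;
}
```

Because the tail is still one full-width (masked) vector iteration rather than up to `W - 1` scalar iterations, performance degrades far less as `N % W` grows.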
LoopVectorization uses `if/ifelse` checks to determine how many extra vectors are needed, the last of which is masked.
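A rough plain-C sketch of that tail handling, assuming a main loop unrolled by 4 vectors (the function name and the loop shape are illustrative stand-ins for LoopVectorization's generated branches, shown here only for the tail with the main loop omitted):

```c
#include <stddef.h>

enum { VL = 8 }; /* vector width */

/* Sketch of the if/ifelse tail strategy described above: after the
 * 4-vector main loop exits, checks on the remaining length decide how
 * many extra full vectors to process; only the last vector is masked. */
double dot_tail(const double *a, const double *b, size_t N) {
    double s = 0.0;
    size_t i = 0;                    /* main 4*VL loop omitted here */
    size_t rem = N - i;
    while (rem > VL) {               /* extra full (unmasked) vectors */
        for (size_t l = 0; l < VL; ++l)
            s += a[i + l] * b[i + l];
        i += VL;
        rem -= VL;
    }
    for (size_t l = 0; l < VL; ++l)  /* final vector, masked */
        if (i + l < N)
            s += a[i + l] * b[i + l];
    return s;
}
```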