
Commit 1665bce

Further improvements to stride penalties. The penalty is now just the dot product of roughly estimated array strides and loop repetition counts (uncorrected for unrolling factors; should probably make those adjustments?)
1 parent b588263 commit 1665bce

5 files changed: +41 −31 lines

docs/src/examples/dot_product.md

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,8 @@ Double precision benchmarks pitting Julia's builtin dot product (named `MKL` her
 What we just described is the core of the approach used by all these compilers. The variation in results is explained mostly by how they handle vectors with lengths that are not an integer multiple of `W`. I ran these on a computer with AVX512 so that `W = 8`. LLVM, the backend compiler of both Julia and Clang, shows rapid performance degradation as `N % 4W` increases, where `N` is the length of the vectors.
 This is because, to handle the remainder, it uses a scalar loop that runs as written: multiply and add single elements, one after the other.
 
-GCC (gfortran) stumbles in throughput, because it does not use separate accumulation vectors.
+Initially, GCC (gfortran) stumbled in throughput, because it does not use separate accumulation vectors by default except on Power, even with `-funroll-loops`.
+I compiled with the flags `-fvariable-expansion-in-unroller --param max-variable-expansions-in-unroller=4` to allow for 4 accumulation vectors, yielding good performance.
 
 The Intel compilers have a secondary vectorized loop without any additional unrolling that masks off excess lanes beyond `N` (for when `N` isn't an integer multiple of `W`).
 LoopVectorization uses `if/ifelse` checks to determine how many extra vectors are needed, the last of which is masked.
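
As an aside for readers of this doc change: the sketch below is a plain-Julia illustration of the two ideas discussed above, separate accumulation chains and a remainder loop. It is a toy example (the name `mydot_unrolled` is made up), not LoopVectorization's or any compiler's actual code generation, and it unrolls scalars rather than vectors of width `W`.

```julia
# Toy illustration: 4 independent accumulators (standing in for 4 accumulation
# vectors) hide the latency of the fused multiply-add chain, and a scalar loop
# handles the remainder when N is not a multiple of the unroll factor,
# analogous to LLVM's scalar tail loop described above.
function mydot_unrolled(a, b)
    N = length(a)
    s1 = s2 = s3 = s4 = zero(eltype(a))
    i = 1
    while i + 3 <= N            # main unrolled loop: 4 independent chains
        s1 = muladd(a[i],   b[i],   s1)
        s2 = muladd(a[i+1], b[i+1], s2)
        s3 = muladd(a[i+2], b[i+2], s3)
        s4 = muladd(a[i+3], b[i+3], s4)
        i += 4
    end
    s = (s1 + s2) + (s3 + s4)   # combine the accumulators once, at the end
    while i <= N                # remainder handled one element at a time
        s = muladd(a[i], b[i], s)
        i += 1
    end
    s
end

a = rand(1003); b = rand(1003);   # length deliberately not a multiple of 4
mydot_unrolled(a, b) ≈ sum(a .* b)
```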

src/determinestrategy.jl

Lines changed: 31 additions & 27 deletions
@@ -480,39 +480,43 @@ function set_upstream_family!(adal::Vector{T}, op::Operation, val::T) where {T}
         set_upstream_family!(adal, opp, val)
     end
 end
-function stride_penalty_opdependent(ls::LoopSet, op::Operation, order::Vector{Symbol}, contigsym::Symbol)
-    num_loops = length(order)
-    firstloopdeps = loopdependencies(findparent(ls, contigsym))
-    iter = 1
-    for i ∈ 0:num_loops - 1
-        loopsym = order[num_loops - i]
-        loopsym ∈ firstloopdeps && return iter
-        iter *= length(getloop(ls, loopsym))
-    end
-    iter
-end
-function stride_penalty(ls::LoopSet, op::Operation, order::Vector{Symbol})
-    num_loops = length(order)
-    contigsym = first(loopdependencies(op.ref))
-    contigsym == Symbol("##DISCONTIGUOUSSUBARRAY##") && return 0
-    first(op.ref.loopedindex) || return stride_penalty_opdependent(ls, op, order, contigsym)
-    iter = 1
-    for i ∈ 0:num_loops - 1
-        loopsym = order[num_loops - i]
-        if loopsym === contigsym
-            return iter
-        elseif loopsym ∈ loopdependencies(op)
-            iter *= length(getloop(ls, loopsym))
+function stride_penalty(ls::LoopSet, op::Operation, order::Vector{Symbol}, loopfreqs)
+    loopdeps = @view(loopdependencies(op.ref)[1:end])
+    if !first(op.ref.loopedindex)
+        loopdeps = @view(loopdependencies(findparent(ls, first(loopdeps)))[1:end])
+    end
+    opstrides = Vector{Int}(undef, length(loopdeps))
+    # very minor stride assumption here, because we don't really want to base optimization decisions on it...
+    if first(loopdeps) == Symbol("##DISCONTIGUOUSSUBARRAY##")
+        loopdeps = @view(parent(loopdeps)[2:end])
+        opstrides[1] = 2.0
+    else
+        opstrides[1] = 1.0
+    end
+    # loops = map(s -> getloop(ls, s), loopdeps)
+    for i ∈ 2:length(loopdeps)
+        opstrides[i] = opstrides[i-1] * length(getloop(ls, loopdeps[i-1]))
+        # opstrides[i] = opstrides[i-1] * length(loops[i-1])
+    end
+    penalty = 0.0
+    for i ∈ eachindex(order)
+        id = findfirst(isequal(order[i]), loopdeps)
+        if !isnothing(id)
+            penalty += loopfreqs[i] * opstrides[id]
         end
     end
-    iter
+    penalty
 end
 function stride_penalty(ls::LoopSet, order::Vector{Symbol})
-    stridepenalty = 0
-    total_iter = prod(length, ls.loops)
+    stridepenalty = 0.0
+    loopfreqs = Vector{Int}(undef, length(order))
+    loopfreqs[1] = 1
+    for i ∈ 2:length(order)
+        loopfreqs[i] = loopfreqs[i-1] * length(getloop(ls, order[i]))
+    end
     for op ∈ operations(ls)
         if accesses_memory(op)
-            stridepenalty += stride_penalty(ls, op, order)
+            stridepenalty += stride_penalty(ls, op, order, loopfreqs)
         end
     end
     stridepenalty# * 1e-9
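
To make the new heuristic easier to follow outside the `LoopSet` machinery, here is a self-contained toy sketch. The names (`toy_stride_penalty`, `looplengths`, `accessdims`) are hypothetical and the stride estimates are deliberately crude; the point is only to show the dot product of estimated strides and loop repetition counts that the commit message describes.

```julia
# Hypothetical stand-in for the penalty computed in the diff above: the score of
# one array access under a candidate loop order (outermost loop first).
function toy_stride_penalty(order::Vector{Symbol}, looplengths::Dict{Symbol,Int},
                            accessdims::Vector{Symbol})
    # Rough per-dimension strides: dimension i steps by the product of the
    # lengths of the dimensions before it (unit stride for the first dimension).
    opstrides = ones(Float64, length(accessdims))
    for i in 2:length(accessdims)
        opstrides[i] = opstrides[i-1] * looplengths[accessdims[i-1]]
    end
    # How often each loop level repeats, mirroring `loopfreqs` above.
    loopfreqs = ones(Float64, length(order))
    for i in 2:length(order)
        loopfreqs[i] = loopfreqs[i-1] * looplengths[order[i]]
    end
    # Dot product of strides and repetition counts over the loops the access uses.
    penalty = 0.0
    for (i, loopsym) in enumerate(order)
        id = findfirst(isequal(loopsym), accessdims)
        isnothing(id) || (penalty += loopfreqs[i] * opstrides[id])
    end
    penalty
end

lengths = Dict(:i => 128, :j => 128)
A_dims = [:i, :j]                              # column-major access A[i, j]
toy_stride_penalty([:j, :i], lengths, A_dims)  # i innermost: unit stride in the hot loop
toy_stride_penalty([:i, :j], lengths, A_dims)  # j innermost: large strides dominate
```

For a column-major access `A[i, j]`, the order that keeps `i` innermost scores much lower, which is the preference the penalty is meant to encode.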

src/reconstruct_loopset.jl

Lines changed: 1 addition & 1 deletion
@@ -492,7 +492,7 @@ Execute an `@avx` block. The block's code is represented via the arguments:
 - `vargs...` holds the encoded pointers of all the arrays (see `VectorizationBase`'s various pointer types).
 """
 @generated function _avx_!(::Val{UNROLL}, ::Type{OPS}, ::Type{ARF}, ::Type{AM}, ::Type{LPSYM}, lb::LB, vargs...) where {UNROLL, OPS, ARF, AM, LPSYM, LB}
-    # 1 + 1 # Irrelevant line you can comment out/in to force recompilation...
+    1 + 1 # Irrelevant line you can comment out/in to force recompilation...
     ls = _avx_loopset(OPS.parameters, ARF.parameters, AM.parameters, LPSYM.parameters, LB.parameters, vargs)
     # @show avx_body(ls, UNROLL)
     # @show UNROLL, OPS, ARF, AM, LPSYM, LB

src/split_loops.jl

Lines changed: 1 addition & 1 deletion
@@ -99,7 +99,7 @@ function lower_and_split_loops(ls::LoopSet, inline::Int)
     order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 = choose_order_cost(ls_2)
     # U_1 = T_1 = U_2 = T_2 = 2
     if cost_1 + cost_2 ≤ cost_fused
-        @show cost_1, cost_2, cost_fused
+        # @show cost_1, cost_2, cost_fused
         ls_2_lowered = if length(remaining_ops) > 1
             inline = iszero(inline) ? (shouldinline_1 % Int) : inline
             lower_and_split_loops(ls_2, inline)

test/miscellaneous.jl

Lines changed: 6 additions & 1 deletion
@@ -72,7 +72,12 @@ using Test
     # # @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, :i, :j, Unum, Tnum)
     # @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, :i, :j, 1, 1)
     # end
-    @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :i, Symbol("##undefined##"), :j, 4, -1)
+    # @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :i, Symbol("##undefined##"), :j, 4, -1)
+    if LoopVectorization.REGISTER_COUNT == 32
+        @test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :j, :i, :j, 2, 10)
+    elseif LoopVectorization.REGISTER_COUNT == 16
+        @test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :j, :i, :j, 2, 6)
+    end
     # @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 4, -1)
     ## @avx is SLOWER!!!!
     ## need to fix!
