Skip to content

Commit 49d5ae9

Browse files
committed
Instead of the hack of increasing W to 8 when any array has bits, round the unroll factor to a multiple of `8 ÷ W` when a vectorized axis loads/stores bits
1 parent 21a9872 commit 49d5ae9

File tree

8 files changed

+81
-37
lines changed

8 files changed

+81
-37
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ SLEEFPirates = "0.6.12"
2828
Static = "0.2"
2929
ThreadingUtilities = "0.3"
3030
UnPack = "1"
31-
VectorizationBase = "0.19.5"
31+
VectorizationBase = "0.19.6"
3232
julia = "1.5"
3333

3434
[extras]

src/condense_loopset.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ function check_device(x)
399399
@info """`LoopVectorization.check_args` returned `false`, because `ArrayInterface.device(::$(typeof(x))) == $x`
400400
`LoopVectorization` normally requires `ArrayInterface.CPUPointer` (exceptions include ranges, `BitVector`s, and
401401
`BitArray`s whose number of rows is a multiple of 8). Therefore compiling a probably slow `@inbounds @fastmath` fallback loop.""" maxlog=1
402-
false
402+
false
403403
end
404404

405405
function check_args_call(ls::LoopSet)

src/modeling/determinestrategy.jl

Lines changed: 47 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -125,12 +125,6 @@ end
125125
function biggest_type_size(ls::LoopSet)
126126
maximum(elsize, operations(ls))
127127
end
128-
# function VectorizationBase.pick_vector_width(ls::LoopSet, u::Symbol)
129-
# VectorizationBase.pick_vector_width(length(ls, u), biggest_type_size(ls))
130-
# end
131-
# function VectorizationBase.pick_vector_width_shift(ls::LoopSet, u::Symbol)
132-
# VectorizationBase.pick_vector_width_shift(length(ls, u), biggest_type_size(ls))
133-
# end
134128
function hasintersection(a, b)
135129
for aᵢ ∈ a, bᵢ ∈ b
136130
aᵢ === bᵢ && return true
@@ -242,7 +236,6 @@ end
242236
function unroll_no_reductions(ls, order, vloopsym)
243237
size_T = biggest_type_size(ls)
244238
W, Wshift = lsvecwidthshift(ls, vloopsym, size_T)
245-
# W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vloopsym), size_T)::Tuple{Int,Int}
246239

247240
compute_rt = load_rt = store_rt = 0.0
248241
unrolled = last(order)
@@ -361,11 +354,25 @@ function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vloopsym::S
361354
num_reductions = count_reductions(ls)
362355
# The strategy is to use an unroll factor of 1, unless there appears to be loop carried dependencies (ie, num_reductions > 0)
363356
# The assumption here is that unrolling provides no real benefit, unless it is needed to enable OOO execution by breaking up these dependency chains
364-
if iszero(num_reductions)
365-
# if only 1 loop, no need to unroll
366-
# if more than 1 loop, there is some cost. Picking 2 here as a heuristic.
367-
return unroll_no_reductions(ls, order, vloopsym)
357+
loopindexesbit = ls.loopindexesbit
358+
if iszero(length(loopindexesbit)) || ((!loopindexesbit[getloopid(ls, vloopsym)]))
359+
if iszero(num_reductions)
360+
return unroll_no_reductions(ls, order, vloopsym)
361+
else
362+
return determine_unroll_factor(ls, order, vloopsym, num_reductions)
363+
end
364+
elseif iszero(num_reductions)
365+
return 8 ÷ ls.vector_width[], vloopsym
366+
else
367+
rttemp, ltemp = determine_unroll_factor(ls, order, vloopsym, vloopsym)
368+
UF = min(8, VectorizationBase.nextpow2(max(1, round(Int, ltemp / (rttemp * num_reductions) ) )))
369+
UFfactor = 8 ÷ ls.vector_width[]
370+
cld(UF, UFfactor)*UFfactor, vloopsym
368371
end
372+
end
373+
# function scale_unrolled()
374+
# end
375+
function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vloopsym::Symbol, num_reductions::Int)
369376
innermost_loop = last(order)
370377
rt = Inf; rtcomp = Inf; latency = Inf; best_unrolled = Symbol("")
371378
for unrolled ∈ order
@@ -533,12 +540,17 @@ function solve_unroll(
533540
W::Int, vloopsym::Symbol, rounduᵢ::Int
534541
)
535542
(u₁step, u₂step) = if rounduᵢ == 1 # max is to safeguard against some weird arch I've never heard of.
536-
(max(1,cache_lnsze(ls) ÷ reg_size(ls)), 1)
543+
(max(1, cache_lnsze(ls) ÷ reg_size(ls)), 1)
537544
elseif rounduᵢ == 2
538545
(1, max(1,cache_lnsze(ls) ÷ reg_size(ls)))
546+
elseif rounduᵢ == -1
547+
(8 ÷ ls.vector_width[], 1)
548+
elseif rounduᵢ == -2
549+
(1, 8 ÷ ls.vector_width[])
539550
else
540551
(1, 1)
541552
end
553+
# @show u₁step, u₂step
542554
u₁loop = getloop(ls, u₁loopsym)
543555
u₂loop = getloop(ls, u₂loopsym)
544556
solve_unroll(
@@ -921,7 +933,7 @@ end
921933
# But optimal order within tile must still be determined
922934
# as well as size of the tiles.
923935
function evaluate_cost_tile(
924-
ls::LoopSet, order::Vector{Symbol}, unrollsyms::UnrollSymbols
936+
ls::LoopSet, order::Vector{Symbol}, unrollsyms::UnrollSymbols, anyisbit::Bool
925937
)
926938
N = length(order)
927939
@assert N ≥ 2 "Cannot tile merely $N loops!"
@@ -940,7 +952,6 @@ function evaluate_cost_tile(
940952
# Need to check if fusion is possible
941953
size_T = biggest_type_size(ls)
942954
W, Wshift = lsvecwidthshift(ls, vloopsym, size_T)
943-
# W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vloopsym), size_T)::Tuple{Int,Int}
944955
# costs =
945956
# cost_mat[1] / ( unrolled * u₂loopsym)
946957
# cost_mat[2] / ( u₂loopsym)
@@ -1019,10 +1030,8 @@ function evaluate_cost_tile(
10191030
#elseif isconstant(op)
10201031
end
10211032
rt, lat, rp = cost(ls, op, (u₁loopsym, u₂loopsym), vloopsym, Wshift, size_T)
1022-
if isload(op)
1023-
if !prefetch_good_idea
1024-
prefetch_good_idea = prefetchisagoodidea(ls, op, UnrollArgs(ls, 4, unrollsyms, 4, 0)) ≠ 0
1025-
end
1033+
if isload(op) & (!prefetch_good_idea)
1034+
prefetch_good_idea = prefetchisagoodidea(ls, op, UnrollArgs(ls, 4, unrollsyms, 4, 0)) ≠ 0
10261035
end
10271036
# rp = (opisininnerloop && !(loadintostore(ls, op))) ? rp : zero(rp) # we only care about register pressure within the inner most loop
10281037
rp = opisininnerloop ? rp : zero(rp) # we only care about register pressure within the inner most loop
@@ -1041,10 +1050,22 @@ function evaluate_cost_tile(
10411050
# reg_pres[4] == remaining_registers
10421051
costpenalty = ((reg_pressure[1] + reg_pressure[2] + reg_pressure[3]) > reg_pressure[4]) ? 2 : 1
10431052
u₁v = vloopsym === u₁loopsym; u₂v = vloopsym === u₂loopsym
1044-
round_uᵢ = prefetch_good_idea ? (u₁v ? 1 : (u₂v ? 2 : 0)) : 0
1053+
visbit = anyisbit && ls.loopindexesbit[getloopid(ls,vloopsym)]
1054+
round_uᵢ = if visbit
1055+
(u₁v ? -1 : (u₂v ? -2 : 0))
1056+
elseif prefetch_good_idea
1057+
(u₁v ? 1 : (u₂v ? 2 : 0))
1058+
else
1059+
0
1060+
end
10451061
# @show (irreducible_storecosts / sum(cost_vec))
10461062
if (irreducible_storecosts / sum(cost_vec) ≥ 0.5) && !any(op -> loadintostore(ls, op), operations(ls))
1047-
u₁, u₂ = (1, 1)
1063+
u₁, u₂ = if visbit
1064+
vecsforbyte = 8 ÷ ls.vector_width[]
1065+
u₁v ? (vecsforbyte,1) : (1,vecsforbyte)
1066+
else
1067+
(1, 1)
1068+
end
10481069
ucost = unroll_cost(cost_vec, 1, 1, length(getloop(ls, u₁loopsym)), length(getloop(ls, u₂loopsym)))
10491070
else
10501071
u₁, u₂, ucost = solve_unroll(ls, u₁loopsym, u₂loopsym, cost_vec, reg_pressure, W, vloopsym, round_uᵢ)
@@ -1198,6 +1219,7 @@ function choose_tile(ls::LoopSet)
11981219
best_order = copyto!(ls.loop_order.bestorder, lo.syms)
11991220
bestu₁ = bestu₂ = best_vec = first(best_order) # filler
12001221
u₁ = u₂ = 0; lowest_cost = Inf; shouldinline = false
1222+
anyisbit = any(ls.loopindexesbit)
12011223
for newu₂ ∈ lo.syms
12021224
reject_reorder(ls, newu₂) && continue
12031225
for newu₁ ∈ lo.syms#@view(new_order[nt+1:end])
@@ -1207,7 +1229,11 @@ function choose_tile(ls::LoopSet)
12071229
while true
12081230
for new_vec ∈ new_order # view to skip first
12091231
reject_reorder(ls, new_vec) && continue
1210-
u₁temp, u₂temp, cost_temp, shouldinline_temp = evaluate_cost_tile(ls, new_order, UnrollSymbols(newu₁, newu₂, new_vec))
1232+
if anyisbit && ls.loopindexesbit[getloopid(ls,new_vec)]
1233+
# ((new_vec === newu₁) || (new_vec === newu₂)) || continue
1234+
(new_vec === newu₁) || continue
1235+
end
1236+
u₁temp, u₂temp, cost_temp, shouldinline_temp = evaluate_cost_tile(ls, new_order, UnrollSymbols(newu₁, newu₂, new_vec), anyisbit)
12111237
# if cost_temp < lowest_cost # leads to 4 vmovapds
12121238
if cost_temp ≤ lowest_cost # lead to 2 vmovapds
12131239
lowest_cost = cost_temp

src/modeling/graphs.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,7 @@ struct LoopSet
441441
equalarraydims::Vector{Tuple{Vector{Symbol},Vector{Int}}}
442442
omop::OffsetLoadCollection
443443
loopordermap::Vector{Int}
444+
loopindexesbit::Vector{Bool}
444445
mod::Symbol
445446
end
446447

@@ -562,7 +563,7 @@ function LoopSet(mod::Symbol)
562563
Ref(-1), # Ureduct
563564
Tuple{Vector{Symbol},Vector{Int}}[],
564565
OffsetLoadCollection(),
565-
Int[],
566+
Int[], Bool[],
566567
mod
567568
)
568569
end

src/reconstruct_loopset.jl

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,14 @@ function add_mref!(
181181
end
182182
add_mref_ptr!(sptrs, ls, ar, Tsym, C, B, sp, name)
183183
end
184+
function loop_indexes_bit!(ls::LoopSet, ar::ArrayReferenceMeta)
185+
li = ar.loopedindex;
186+
ind = first(getindices(ar))
187+
ind === DISCONTIGUOUS && return
188+
first(li) || throw(LoopError("The contiguous index of a `BitArray` shouldn't be a complex function.", ind))
189+
ls.loopindexesbit[getloopid(ls,ind)] = true
190+
nothing
191+
end
184192
function add_mref_ptr!(
185193
sptrs::Expr, ls::LoopSet, ar::ArrayReferenceMeta, Tsym::Symbol,
186194
C::Int, B::Int, sp::NTuple{N,Int}, name::Symbol
@@ -190,6 +198,8 @@ function add_mref_ptr!(
190198
column_major = ntuple(identity, N)
191199
li = ar.loopedindex;
192200
if sp === column_major || isone(length(li))
201+
# don't set `bit` to true if our vector width is ≥ 8
202+
((Tsym === :Bit) && (ls.vector_width[] < 8)) && loop_indexes_bit!(ls, ar)
193203
return extract_gsp!(sptrs, name)
194204
end
195205
permute_mref!(ar, C, sp)
@@ -206,6 +216,7 @@ function add_mref_ptr!(
206216
push!(strd_tup.args, Expr(:call, gf, strides, p, false))
207217
push!(offsets_tup.args, Expr(:call, gf, offsets, p, false))
208218
end
219+
#TODO: fix for `Tsym === Bit`.
209220
sptype = Expr(:curly, lv(:StridedPointer), Tsym, N, (C == -1 ? -1 : 1), B, column_major)
210221
sptr = Expr(:call, sptype, Expr(:call, :pointer, tmpsp), strd_tup, offsets_tup)
211222
pushpreamble!(ls, Expr(:(=), name, sptr))
@@ -527,11 +538,10 @@ function sizeofeltypes(v)::Int
527538
# sizeof(T)
528539
end
529540

530-
function avx_loopset(
531-
instr::Vector{Instruction}, ops::Vector{OperationStruct}, arf::Vector{ArrayRefStruct},
541+
function avx_loopset!(
542+
ls::LoopSet, instr::Vector{Instruction}, ops::Vector{OperationStruct}, arf::Vector{ArrayRefStruct},
532543
AM::Vector{Any}, LPSYM::Vector{Any}, LB::Core.SimpleVector, vargs::Core.SimpleVector
533544
)
534-
ls = LoopSet(:LoopVectorization)
535545
# TODO: check outer reduction types instead
536546
elementbytes = if length(vargs[1].parameters) > 0
537547
sizeofeltypes(vargs[1].parameters[1].parameters)
@@ -546,6 +556,7 @@ function avx_loopset(
546556
nopsv = NOpsType[calcnops(ls, op) for op in ops]
547557
expandedv = [isexpanded(ls, ops, nopsv, i) for i ∈ eachindex(ops)]
548558

559+
resize!(ls.loopindexesbit, length(ls.loops)); ls.loopindexesbit .= false;
549560
mrefs = create_mrefs!(ls, arf, arraysymbolinds, opsymbols, nopsv, expandedv, vargs[1])
550561
for mref ∈ mrefs
551562
push!(ls.includedactualarrays, vptr(mref))
@@ -588,15 +599,17 @@ function _avx_loopset(
588599
nops = length(OPSsv) ÷ 3
589600
instr = Instruction[Instruction(OPSsv[3i+1], OPSsv[3i+2]) for i ∈ 0:nops-1]
590601
ops = OperationStruct[ OPSsv[3i] for i ∈ 1:nops ]
591-
ls = avx_loopset(
592-
instr, ops,
602+
ls = LoopSet(:LoopVectorization)
603+
inline, u₁, u₂, W, rs, rc, cls, l1, l2, l3, nt = UNROLL
604+
set_hw!(ls, rs, rc, cls, l1, l2, l3); ls.vector_width[] = W
605+
avx_loopset!(
606+
ls, instr, ops,
593607
ArrayRefStruct[ARFsv...],
594608
tovector(AMsv), tovector(LPSYMsv), LBsv, vargs
595609
)::LoopSet
596-
inline, u₁, u₂, W, rs, rc, cls, l1, l2, l3, nt = UNROLL
597-
set_hw!(ls, rs, rc, cls, l1, l2, l3); ls.vector_width[] = W
598610
ls
599611
end
612+
600613
"""
601614
_avx_!(unroll, ops, arf, am, lpsym, lb, vargs...)
602615
@@ -619,7 +632,7 @@ Execute an `@avx` block. The block's code is represented via the arguments:
619632
@generated function _avx_!(
620633
::Val{UNROLL}, ::Val{OPS}, ::Val{ARF}, ::Val{AM}, ::Val{LPSYM}, var"#lv#tuple#args#"::Tuple{LB,V}
621634
) where {UNROLL, OPS, ARF, AM, LPSYM, LB, V}
622-
# 1 + 1 # Irrelevant line you can comment out/in to force recompilation...
635+
1 + 1 # Irrelevant line you can comment out/in to force recompilation...
623636
ls = _avx_loopset(OPS, ARF, AM, LPSYM, LB.parameters, V.parameters, UNROLL)
624637
# return @show avx_body(ls, UNROLL)
625638
if last(UNROLL) > 1

test/gemm.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@
7171
end
7272
end
7373
function AmulBavx1!(C, A, B)
74-
@avx unroll=(1,2) for m ∈ 1:size(A,1), n ∈ axes(B,2)
74+
@avx for m ∈ 1:size(A,1), n ∈ axes(B,2)
7575
Cₘₙ = zero(eltype(C))
7676
for k ∈ 1:size(A,2)
7777
Cₘₙ += A[m,k] * B[k,n]
@@ -624,7 +624,8 @@
624624
# @test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :m, :n, :m, 1, 8)
625625
@test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :k, :n, :m, 1, 8)
626626
elseif LoopVectorization.register_count() == 16
627-
@test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :m, :n, :m, 2, 4)
627+
# @test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :m, :n, :m, 2, 4)
628+
@test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :n, :m, :n, 2, 4)
628629
elseif LoopVectorization.register_count() == 8
629630
@test LoopVectorization.choose_order(lsAtmulBt8) == ([:n, :m, :k], :m, :n, :m, 1, 4)
630631
end

test/gemv.jl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -259,8 +259,10 @@ using Test
259259
y1 = view(y1full, M .+ (1:M));
260260
y2 = view(y2full, M .+ (1:M));
261261
Abit = A .> 0.5;
262-
fill!(y2, -9999); mygemv_avx!(y2, Abit, x);
263-
@test y2 ≈ Abit * x
262+
if LoopVectorization.pick_vector_width(T) ≥ 8
263+
fill!(y2, -9999); mygemv_avx!(y2, Abit, x);
264+
@test y2 ≈ Abit * x
265+
end
264266
fill!(y2, -9999); mygemvavx!(y2, Abit, x);
265267
@test y2 ≈ Abit * x
266268
xbit = x .> 0.5;

test/miscellaneous.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,8 @@ using Test
6767
end)
6868
lssubcol = LoopVectorization.loopset(subcolq);
6969
# @test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :i, Symbol("##undefined##"), :j, 1, -1)
70-
@test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :j, :i, :j, 1, 8)
70+
# @test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :j, :i, :j, 1, 8)
71+
@test LoopVectorization.choose_order(lssubcol) == (Symbol[:i,:j], :j, :i, :j, 1, 6)
7172

7273

7374
# if LoopVectorization.register_count() != 8

0 commit comments

Comments
 (0)