Skip to content

Commit 01ad79b

Browse files
committed
Prepare for CheapThreads -> Polyester, add upperboundedintegers
1 parent e61d37f commit 01ad79b

File tree

8 files changed

+113
-39
lines changed

8 files changed

+113
-39
lines changed

src/LoopVectorization.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ using VectorizationBase: register_size, register_count, cache_linesize, cache_si
2727
using IfElse: ifelse
2828

2929
using ThreadingUtilities, CheapThreads
30+
const Polyester = CheapThreads
31+
# using ThreadingUtilities, Polyester
3032
using Base.Broadcast: Broadcasted, DefaultArrayStyle
3133
using LinearAlgebra: Adjoint, Transpose, Diagonal
3234
using Base.Meta: isexpr

src/codegen/lower_load.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ function prefetchisagoodidea(ls::LoopSet, op::Operation, td::UnrollArgs)
3030
end
3131
end
3232
innermostloopind == -1 && return 0
33-
if prod(s -> Float64(looplength(ls, s)), @view(indices[1:innermostloopind-1])) ≥ 120.0 && length(getloop(ls, innermostloopsym)) ≥ 120
33+
# if prod(s -> Float64(looplength(ls, s)), @view(indices[1:innermostloopind-1])) ≥ 120.0 &&
34+
if length(getloop(ls, innermostloopsym)) ≥ 120
3435
if getoffsets(op)[innermostloopind] < 120
3536
for opp ∈ operations(ls)
3637
if iscompute(opp) && (innermostloopsym ∈ loopdependencies(opp)) &&

src/codegen/lower_threads.jl

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -378,12 +378,12 @@ function thread_one_loops_expr(
378378
$loopstart
379379
var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
380380
if var"##do#thread##"
381-
var"#threads#", var"#torelease#" = CheapThreads.request_threads(Threads.threadid()%UInt32, var"#nrequest#")
381+
var"#threads#", var"#torelease#" = Polyester.request_threads(Threads.threadid()%UInt32, var"#nrequest#")
382382
var"#thread#factor#0#" = var"#nthreads#"
383383
$iterdef
384384
var"#thread#launch#count#" = 0x00000000
385385
var"#thread#id#" = 0x00000000
386-
var"#thread#mask#" = CheapThreads.mask(var"#threads#")
386+
var"#thread#mask#" = Polyester.mask(var"#threads#")
387387
var"#threads#remain#" = true
388388
while var"#threads#remain#"
389389
VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#"))
@@ -409,14 +409,14 @@ function thread_one_loops_expr(
409409
var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
410410
end
411411
else# eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't
412-
var"#torelease#" = zero(CheapThreads.worker_type())
413-
var"#threads#" = CheapThreads.UnsignedIteratorEarlyStop(var"#torelease#", 0x00000000)
412+
var"#torelease#" = zero(Polyester.worker_type())
413+
var"#threads#" = Polyester.UnsignedIteratorEarlyStop(var"#torelease#", 0x00000000)
414414
end
415415
var"#avx#call#args#" = $avxcall_args
416416
$_avx_call_
417417
var"##do#thread##" || $retexpr
418418
var"#thread#id#" = 0x00000000
419-
var"#thread#mask#" = CheapThreads.mask(var"#threads#")
419+
var"#thread#mask#" = Polyester.mask(var"#threads#")
420420
var"#threads#remain#" = true
421421
while var"#threads#remain#"
422422
VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#"))
@@ -428,7 +428,7 @@ function thread_one_loops_expr(
428428
$update_return_values
429429
var"#threads#remain#" = var"#thread#mask#" ≠ 0x00000000
430430
end
431-
CheapThreads.free_threads!(var"#torelease#")
431+
Polyester.free_threads!(var"#torelease#")
432432
$retexpr
433433
end
434434
Expr(:block, ls.preamble, q)
@@ -559,15 +559,15 @@ function thread_two_loops_expr(
559559
var"#loop#1#start#init#" = var"#iter#start#0#"
560560
var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
561561
if var"##do#thread##"
562-
var"#threads#", var"#torelease#" = CheapThreads.request_threads(Threads.threadid(), var"#nrequest#")
562+
var"#threads#", var"#torelease#" = Polyester.request_threads(Threads.threadid(), var"#nrequest#")
563563
$iterdef1
564564
$iterdef2
565565
# @show var"#base#block#size#thread#0#", var"#block#rem#step#0#" var"#base#block#size#thread#1#", var"#block#rem#step#1#"
566566
var"#thread#launch#count#" = 0x00000000
567567
var"#thread#launch#count#0#" = 0x00000000
568568
var"#thread#launch#count#1#" = 0x00000000
569569
var"#thread#id#" = 0x00000000
570-
var"#thread#mask#" = CheapThreads.mask(var"#threads#")
570+
var"#thread#mask#" = Polyester.mask(var"#threads#")
571571
var"#threads#remain#" = true
572572
while var"#threads#remain#"
573573
VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#"))
@@ -604,16 +604,16 @@ function thread_two_loops_expr(
604604
var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
605605
end
606606
else# eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't
607-
var"#torelease#" = zero(CheapThreads.worker_type())
608-
var"#threads#" = CheapThreads.UnsignedIteratorEarlyStop(var"#torelease#", 0x00000000)
607+
var"#torelease#" = zero(Polyester.worker_type())
608+
var"#threads#" = Polyester.UnsignedIteratorEarlyStop(var"#torelease#", 0x00000000)
609609
end
610610
# @show $lastboundexpr
611611
var"#avx#call#args#" = $avxcall_args
612612
$_avx_call_
613613
var"##do#thread##" || $retexpr
614614
# @show $retv
615615
var"#thread#id#" = 0x00000000
616-
var"#thread#mask#" = CheapThreads.mask(var"#threads#")
616+
var"#thread#mask#" = Polyester.mask(var"#threads#")
617617
var"#threads#remain#" = true
618618
while var"#threads#remain#"
619619
VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#"))
@@ -625,7 +625,7 @@ function thread_two_loops_expr(
625625
$update_return_values
626626
var"#threads#remain#" = var"#thread#mask#" ≠ 0x00000000
627627
end
628-
CheapThreads.free_threads!(var"#torelease#")
628+
Polyester.free_threads!(var"#torelease#")
629629
$retexpr
630630
end
631631
# @show

src/codegen/lowering.jl

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -218,10 +218,17 @@ function lower_unrolled_dynamic(ls::LoopSet, us::UnrollSpecification, n::Int, in
218218
nisunrolled = isunrolled1(us, n)
219219
nisvectorized = isvectorized(us, n)
220220
W = nisvectorized ? ls.vector_width : 1
221-
loopisstatic = isstaticloop(loop) & (!iszero(W))
222221
UFW = UF * W
223222
looplength = length(loop)
224-
if loopisstatic & (UFW > looplength)
223+
if W ≠ 0 & isknown(first(loop)) & isknown(step(loop))
224+
loopisstatic = isknown(last(loop))
225+
# something other than the default hint currently means an UpperBoundedInteger was passed as an argument
226+
loopisbounded = (looplength < UFW) & (loopisstatic | (gethint(last(loop)) ≠ 1024))
227+
else
228+
loopisstatic = false
229+
loopisbounded = false
230+
end
231+
if loopisstatic & loopisbounded
225232
UFWnew = cld(looplength, cld(looplength, UFW))
226233
UF = cld(UFWnew, W)
227234
UFW = UF * W
@@ -235,9 +242,11 @@ function lower_unrolled_dynamic(ls::LoopSet, us::UnrollSpecification, n::Int, in
235242
remfirst = loopisstatic & (!nisvectorized) & (UFt > 0) & !(unsigned(Ureduct) < unsigned(UF))
236243
tc = terminatecondition(ls, us, n, inclmask, remfirst ? 1 : UF)
237244
# usorig = ls.unrollspecification
238-
# tc = (usorig.u₁ == us.u₁) && (usorig.u₂ == us.u₂) && !loopisstatic && !inclmask && !ls.loadelimination ? expect(tc) : tc
239-
body = lower_block(ls, us, n, inclmask, UF)
240-
if loopisstatic
245+
# tc = (usorig.u₁ == us.u₁) && (usorig.u₂ == us.u₂) && !loopisstatic && !inclmask && !ls.loadelimination ? expect(tc) : tc
246+
# Don't need to create the body if loop is dynamic and bounded
247+
dynamicbounded = ((!loopisstatic) & loopisbounded)
248+
body = dynamicbounded ? tc : lower_block(ls, us, n, inclmask, UF)
249+
if loopisstatic
241250
iters = length(loop) ÷ UFW
242251
if (iters ≤ 1) || (iters*UF ≤ 16 && allinteriorunrolled(ls, us, n))# Let's set a limit on total unrolling
243252
q = Expr(:block)
@@ -254,16 +263,18 @@ function lower_unrolled_dynamic(ls::LoopSet, us::UnrollSpecification, n::Int, in
254263
remblock = init_remblock(loop, ls.lssm, n)#loopsym)
255264
# unroll_cleanup = Ureduct > 0 || (nisunrolled ? (u₂ > 1) : (u₁ > 1))
256265
# remblock = unroll_cleanup ? init_remblock(loop, ls.lssm, n)#loopsym) : Expr(:block)
257-
q = if unsigned(Ureduct) < unsigned(UF)
258-
# push!(body.args, Expr(:(||), tc, Expr(:break)))
259-
# Expr(:while, true, body)
260-
termcond = gensym(:maybeterm)
261-
push!(body.args, Expr(:(=), termcond, tc))
262-
Expr(:block, Expr(:(=), termcond, true), Expr(:while, termcond, body))
263-
else
264-
Expr(:while, tc, body)
265-
end
266-
end
266+
q = if loopisbounded
267+
Expr(:block)
268+
elseif unsigned(Ureduct) < unsigned(UF)
269+
# push!(body.args, Expr(:(||), tc, Expr(:break)))
270+
# Expr(:while, true, body)
271+
termcond = gensym(:maybeterm)
272+
push!(body.args, Expr(:(=), termcond, tc))
273+
Expr(:block, Expr(:(=), termcond, true), Expr(:while, termcond, body))
274+
else
275+
Expr(:while, tc, body)
276+
end
277+
end
267278
q = if unsigned(Ureduct) < unsigned(UF) # unsigned(-1) == typemax(UInt);
268279
add_cleanup = true
269280
if isone(Ureduct)
@@ -290,7 +301,11 @@ function lower_unrolled_dynamic(ls::LoopSet, us::UnrollSpecification, n::Int, in
290301
UF_cleanup = UF - Ureduct
291302
blockhead = :if
292303
end
293-
_q = Expr(:block, add_upper_outer_reductions(ls, q, Ureduct, UF, loop, nisvectorized))
304+
_q = if dynamicbounded
305+
initialize_outer_reductions!(q, ls, Ureduct); q
306+
else
307+
Expr(:block, add_upper_outer_reductions(ls, q, Ureduct, UF, loop, nisvectorized))
308+
end
294309
if add_cleanup
295310
cleanup_expr = Expr(blockhead)
296311
blockhead === :block || push!(cleanup_expr.args, terminatecondition(ls, us, n, inclmask, UF_cleanup))

src/condense_loopset.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ end
226226

227227
@inline zerorangestart(r::Base.OneTo) = CloseOpen(maybestaticlast(r))
228228
@inline zerorangestart(r::CloseOpen) = CloseOpen(length(r))
229+
@inline zerorangestart(r::CloseOpen{Zero}) = r
229230
@inline zerorangestart(r::AbstractUnitRange) = Zero():One():(maybestaticlast(r)-maybestaticfirst(r))
230231
@inline zerorangestart(r::AbstractRange) = Zero():static_step(r):(maybestaticlast(r)-maybestaticfirst(r))
231232
@inline zerorangestart(r::CartesianIndices) = CartesianIndices(map(zerorangestart, r.indices))

src/reconstruct_loopset.jl

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,25 @@
11
const NOpsType = Int#Union{Int,Vector{Int}}
22

3-
function Loop(ls::LoopSet, ex::Expr, sym::Symbol, f, s, l)
3+
struct UpperBoundedInteger{N, T<: Integer} <: Integer
4+
i::T
5+
end
6+
@inline UpperBoundedInteger{N}(i::T) where {N,T<:Integer} = UpperBoundedInteger{N,T}(i)
7+
@inline UpperBoundedInteger(i::T, ::StaticInt{N}) where {N,T<:Integer} = UpperBoundedInteger{N,T}(i)
8+
@inline Base.:(%)(a::UpperBoundedInteger, ::Type{T}) where {T<:Integer} = a.i % T
9+
Base.promote_rule(::Type{T}, ::Type{UpperBoundedInteger{N,S}}) where {N,T,S} = promote_rule(T,S)
10+
Base.promote_rule(::Type{UpperBoundedInteger{N,S}}, ::Type{T}) where {N,T,S} = promote_rule(S,T)
11+
Base.convert(::Type{T}, i::UpperBoundedInteger) where {T<:Number} = convert(T, i.i)
12+
Base.convert(::Type{UpperBoundedInteger{N,T}}, i::UpperBoundedInteger{N,T}) where {N,T<:Integer} = i
13+
Base.convert(::Type{Any}, i::UpperBoundedInteger) = i
14+
upper_bound(_) = typemax(Int)
15+
upper_bound(::Type{CloseOpen{T,UpperBoundedInteger{N,S}}}) where {T,N,S} = N - 1
16+
17+
@inline Base.last(r::CloseOpen{<:Integer,<:UpperBoundedInteger}) = getfield(getfield(r,:upper),:i) - One()
18+
@inline ArrayInterface.static_last(r::CloseOpen{<:Integer,<:UpperBoundedInteger}) = getfield(getfield(r,:upper),:i) - One()
19+
@inline Base.length(r::CloseOpen{<:Integer,<:UpperBoundedInteger}) = getfield(getfield(r,:upper),:i) - getfield(r,:start)
20+
@inline Base.length(r::CloseOpen{Zero,<:UpperBoundedInteger}) = getfield(getfield(r,:upper),:i)
21+
22+
function Loop(ls::LoopSet, ex::Expr, sym::Symbol, f, s, l, ub::Int)
423
if (f !== nothing) && (s !== nothing) && (l !== nothing)
524
return static_loop(sym, f, s, l)
625
end
@@ -11,32 +30,33 @@ function Loop(ls::LoopSet, ex::Expr, sym::Symbol, f, s, l)
1130
pushpreamble!(ls, Expr(:(=), lensym, Expr(:call, lv(:maybestaticlength), rangesym)))
1231
F = if f === nothing
1332
start = gensym(ssym*"_loopstart")
14-
pushpreamble!(ls, Expr(:(=), start, Expr(:call, lv(:first), rangesym)))
33+
pushpreamble!(ls, Expr(:(=), start, Expr(:call, %, Expr(:call, lv(:first), rangesym), Int)))
1534
MaybeKnown(start, 1)
1635
else
1736
MaybeKnown(f)
1837
end
1938
S = if s === nothing
2039
step = gensym(ssym*"_loopstep")
21-
pushpreamble!(ls, Expr(:(=), step, Expr(:call, lv(:step), rangesym)))
40+
pushpreamble!(ls, Expr(:(=), step, Expr(:call, %, Expr(:call, lv(:step), rangesym), Int)))
2241
MaybeKnown(step, 1)
2342
else
2443
MaybeKnown(s)
2544
end
2645
L = if l === nothing
2746
stop = gensym(ssym*"_loopstop")
28-
pushpreamble!(ls, Expr(:(=), stop, Expr(:call, lv(:last), rangesym)))
29-
MaybeKnown(stop, 1024)
47+
pushpreamble!(ls, Expr(:(=), stop, Expr(:call, %, Expr(:call, lv(:last), rangesym), Int)))
48+
MaybeKnown(stop, min(ub, 1024))
3049
else
3150
MaybeKnown(l)
3251
end
3352
loopiteratesatleastonce!(ls, Loop(sym, F, L, S, rangesym, lensym))
3453
end
3554
function Loop(ls::LoopSet, ex::Expr, sym::Symbol, ::Type{R}) where {R<:AbstractRange}
36-
f = ArrayInterface.known_first(R)
37-
s = ArrayInterface.known_step(R)
38-
l = ArrayInterface.known_last(R)
39-
Loop(ls, ex, sym, f, s, l)
55+
f = ArrayInterface.known_first(R)
56+
s = ArrayInterface.known_step(R)
57+
l = ArrayInterface.known_last(R)
58+
ub = upper_bound(R)
59+
Loop(ls, ex, sym, f, s, l, ub)
4060
end
4161

4262
function static_loop(sym::Symbol, L::Int, S::Int, U::Int)
@@ -683,7 +703,7 @@ Execute an `@avx` block. The block's code is represented via the arguments:
683703
@aggressive_constprop @generated function _avx_!(
684704
::Val{var"#UNROLL#"}, ::Val{var"#OPS#"}, ::Val{var"#ARF#"}, ::Val{var"#AM#"}, ::Val{var"#LPSYM#"}, ::Val{Tuple{var"#LB#",var"#V#"}}, var"#flattened#var#arguments#"::Vararg{Any,var"#num#vargs#"}
685705
) where {var"#UNROLL#", var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#", var"#LB#", var"#V#", var"#num#vargs#"}
686-
# 1 + 1 # Irrelevant line you can comment out/in to force recompilation...
706+
1 + 1 # Irrelevant line you can comment out/in to force recompilation...
687707
ls = _avx_loopset(var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#", var"#LB#".parameters, var"#V#".parameters, var"#UNROLL#")
688708
pushfirst!(ls.preamble.args, :(var"#lv#tuple#args#" = reassemble_tuple(Tuple{var"#LB#",var"#V#"}, var"#flattened#var#arguments#")))
689709
# return @show avx_body(ls, var"#UNROLL#")

test/runtests.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ const START_TIME = time()
7373

7474
@time include("outer_reductions.jl")
7575

76+
@time include("upperboundedintegers.jl")
77+
7678
if VERSION ≥ v"1.6"
7779
@time include("quantum.jl")
7880
end

test/upperboundedintegers.jl

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
using OffsetArrays, Test
2+
@testset "UpperboundedIntegers" begin
3+
function ubsum(x)
4+
@assert firstindex(x) == 0
5+
r = LoopVectorization.CloseOpen(LoopVectorization.UpperBoundedInteger(length(x), StaticInt(15)))
6+
s = zero(eltype(x))
7+
@avx for i ∈ r
8+
s += x[i]
9+
end
10+
s
11+
end
12+
function ubdouble!(y,x)
13+
@assert firstindex(x) == 0
14+
r = LoopVectorization.CloseOpen(LoopVectorization.UpperBoundedInteger(length(x), StaticInt(15)))
15+
@avx for i ∈ r
16+
y[i] = 2*x[i]
17+
end
18+
y
19+
end
20+
for l ∈ 1:15
21+
x = OffsetVector(rand(l), -1)
22+
@test ubsum(x) ≈ sum(x)
23+
@test ubdouble!(similar(x), x) == x .* 2
24+
end
25+
for l ∈ 70:75
26+
x = OffsetVector(rand(l), -1)
27+
# @test ubsum(x) ≈ sum(@view(x[begin:14]))
28+
@test ubsum(x) ≈ ubsum(x)
29+
xs = similar(x)
30+
@test @view(ubdouble!(xs, x)[begin:14]) == @view(x[begin:14]) .* 2;
31+
@test xs[end] ≠ 2x[end]
32+
end
33+
end

0 commit comments

Comments
 (0)