Skip to content

Commit d7e1aad

Browse files
committed
Lower @tturbo overhead when not threading.
1 parent 172d26a commit d7e1aad

File tree

2 files changed

+111
-106
lines changed

2 files changed

+111
-106
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12.27"
4+
version = "0.12.28"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

src/codegen/lower_threads.jl

Lines changed: 110 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -380,44 +380,47 @@ function thread_one_loops_expr(
380380
iterdef = define_block_size(threadedloop, ua.vloop, 0, ls.vector_width)
381381
q = quote
382382
$choose_nthread # UInt
383-
$define_len
384-
$define_num_unrolls
385-
var"#nthreads#" = Base.min(var"#nthreads#", var"#num#unrolls#thread#0#")
386-
var"#nrequest#" = vsub_nw((var"#nthreads#" % UInt32), 0x00000001)
387383
$loopstart
388-
var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
384+
var"##do#thread##" = var"#nthreads#" > one(var"#nthreads#")
389385
if var"##do#thread##"
390-
var"#threads#", var"#torelease#" = Polyester.request_threads(Threads.threadid()%UInt32, var"#nrequest#")
391-
var"#thread#factor#0#" = var"#nthreads#"
392-
$iterdef
393-
var"#thread#launch#count#" = 0x00000000
394-
var"#thread#id#" = 0x00000000
395-
var"#thread#mask#" = Polyester.mask(var"#threads#")
396-
var"#threads#remain#" = true
397-
while var"#threads#remain#"
398-
VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#"))
399-
var"#trailzing#zeros#" = Base.trailing_zeros(var"#thread#mask#") % UInt32
400-
var"#nblock#size#thread#0#" = Core.ifelse(
401-
var"#thread#launch#count#" < (var"#nrem#thread#0#" % UInt32),
402-
vadd_nw(var"#base#block#size#thread#0#", var"#block#rem#step#0#"),
403-
var"#base#block#size#thread#0#"
404-
)
405-
var"#trailzing#zeros#" = vadd_nw(var"#trailzing#zeros#", 0x00000001)
406-
$iterstop
407-
var"#thread#id#" = vadd_nw(var"#thread#id#", var"#trailzing#zeros#")
386+
$define_len
387+
$define_num_unrolls
388+
var"#nthreads#" = Base.min(var"#nthreads#", var"#num#unrolls#thread#0#")
389+
var"#nrequest#" = vsub_nw((var"#nthreads#" % UInt32), 0x00000001)
390+
var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
391+
if var"##do#thread##"
392+
var"#threads#", var"#torelease#" = Polyester.request_threads(Threads.threadid()%UInt32, var"#nrequest#")
393+
var"#thread#factor#0#" = var"#nthreads#"
394+
$iterdef
395+
var"#thread#launch#count#" = 0x00000000
396+
var"#thread#id#" = 0x00000000
397+
var"#thread#mask#" = Polyester.mask(var"#threads#")
398+
var"#threads#remain#" = true
399+
while var"#threads#remain#"
400+
VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#"))
401+
var"#trailzing#zeros#" = Base.trailing_zeros(var"#thread#mask#") % UInt32
402+
var"#nblock#size#thread#0#" = Core.ifelse(
403+
var"#thread#launch#count#" < (var"#nrem#thread#0#" % UInt32),
404+
vadd_nw(var"#base#block#size#thread#0#", var"#block#rem#step#0#"),
405+
var"#base#block#size#thread#0#"
406+
)
407+
var"#trailzing#zeros#" = vadd_nw(var"#trailzing#zeros#", 0x00000001)
408+
$iterstop
409+
var"#thread#id#" = vadd_nw(var"#thread#id#", var"#trailzing#zeros#")
408410

409-
var"##lbvargs#to_launch##" = ($loopboundexpr, var"#vargs#")
410-
avx_launch(Val{$UNROLL}(), $OPS, $ARF, $AM, $LPSYM, StaticType{typeof(var"##lbvargs#to_launch##")}(), flatten_to_tuple(var"##lbvargs#to_launch##"), var"#thread#id#")
411+
var"##lbvargs#to_launch##" = ($loopboundexpr, var"#vargs#")
412+
avx_launch(Val{$UNROLL}(), $OPS, $ARF, $AM, $LPSYM, StaticType{typeof(var"##lbvargs#to_launch##")}(), flatten_to_tuple(var"##lbvargs#to_launch##"), var"#thread#id#")
411413

412-
var"#thread#mask#" >>>= var"#trailzing#zeros#"
414+
var"#thread#mask#" >>>= var"#trailzing#zeros#"
413415

414-
var"#iter#start#0#" = var"#iter#stop#0#"
415-
var"#thread#launch#count#" = vadd_nw(var"#thread#launch#count#", 0x00000001)
416-
var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
416+
var"#iter#start#0#" = var"#iter#stop#0#"
417+
var"#thread#launch#count#" = vadd_nw(var"#thread#launch#count#", 0x00000001)
418+
var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
419+
end
420+
else# eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't
421+
var"#torelease#" = zero(Polyester.worker_type())
422+
var"#threads#" = Polyester.UnsignedIteratorEarlyStop(var"#torelease#", 0x00000000)
417423
end
418-
else# eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't
419-
var"#torelease#" = zero(Polyester.worker_type())
420-
var"#threads#" = Polyester.UnsignedIteratorEarlyStop(var"#torelease#", 0x00000000)
421424
end
422425
var"#avx#call#args#" = $avxcall_args
423426
$_turbo_call_
@@ -438,6 +441,7 @@ function thread_one_loops_expr(
438441
Polyester.free_threads!(var"#torelease#")
439442
$retexpr
440443
end
444+
# Expr(:block, Expr(:meta,:inline), ls.preamble, q)
441445
Expr(:block, ls.preamble, q)
442446
end
443447
function define_vthread_blocks(vloop, u₁loop, u₂loop, u₁, u₂, ntmax, tn)
@@ -534,83 +538,85 @@ function thread_two_loops_expr(
534538
$choose_nthread # UInt
535539
$loopstart1
536540
$loopstart2
537-
# if var"#nthreads#" ≤ 1
538-
# $_turbo_orig_
539-
# return $retexpr
540-
# end
541-
$define_len1
542-
$define_len2
543-
$define_num_unrolls1
544-
$define_num_unrolls2
545-
var"#unroll#prod#" = vmul_nw(var"#num#unrolls#thread#0#", var"#num#unrolls#thread#1#")
546-
if var"#nthreads#" ≥ var"#unroll#prod#"
547-
var"#nthreads#" = var"#unroll#prod#"
548-
var"#thread#factor#0#" = var"#num#unrolls#thread#0#"
549-
var"#thread#factor#1#" = var"#num#unrolls#thread#1#"
550-
else
551-
var"##thread#0##excess##" = var"#num#unrolls#thread#0#" ≥ var"#nthreads#"
552-
var"##thread#1##excess##" = var"#num#unrolls#thread#1#" ≥ var"#nthreads#"
553-
if var"##thread#0##excess##" & var"##thread#1##excess##"
554-
$blockdef
555-
elseif var"##thread#0##excess##" # var"#num#unrolls#thread#1#" is small but var"#num#unrolls#thread#0#" is not; we want to place a small one in front
556-
(var"#thread#factor#1#", var"#thread#factor#0#") = _choose_num_blocks(var"#num#unrolls#thread#1#", StaticInt{1}(), var"#nthreads#", $(staticexpr(ntmax % Int)))
557-
else # var"#num#unrolls#thread#0#" is small, and var"#num#unrolls#thread#1#" may or may not be
558-
(var"#thread#factor#0#", var"#thread#factor#1#") = _choose_num_blocks(var"#num#unrolls#thread#0#", StaticInt{1}(), var"#nthreads#", $(staticexpr(ntmax % Int)))
559-
end
560-
var"#num#unrolls#thread#1#"
561-
var"#thread#factor#0#" = min(var"#thread#factor#0#", var"#num#unrolls#thread#0#")
562-
var"#thread#factor#1#" = min(var"#thread#factor#1#", var"#num#unrolls#thread#1#")
563-
end
564-
# @show (var"#thread#factor#0#", var"#thread#factor#1#")
565-
var"#nrequest#" = vsub_nsw((var"#nthreads#" % UInt32), 0x00000001)
566-
var"#loop#1#start#init#" = var"#iter#start#0#"
567-
var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
541+
var"##do#thread##" = var"#nthreads#" > one(var"#nthreads#")
568542
if var"##do#thread##"
569-
var"#threads#", var"#torelease#" = Polyester.request_threads(Threads.threadid(), var"#nrequest#")
570-
$iterdef1
571-
$iterdef2
572-
# @show var"#base#block#size#thread#0#", var"#block#rem#step#0#" var"#base#block#size#thread#1#", var"#block#rem#step#1#"
573-
var"#thread#launch#count#" = 0x00000000
574-
var"#thread#launch#count#0#" = 0x00000000
575-
var"#thread#launch#count#1#" = 0x00000000
576-
var"#thread#id#" = 0x00000000
577-
var"#thread#mask#" = Polyester.mask(var"#threads#")
578-
var"#threads#remain#" = true
579-
while var"#threads#remain#"
580-
VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#"))
581-
var"#trailzing#zeros#" = Base.trailing_zeros(var"#thread#mask#") % UInt32
582-
var"#nblock#size#thread#0#" = Core.ifelse(
583-
var"#thread#launch#count#0#" < (var"#nrem#thread#0#" % UInt32),
584-
vadd_nw(var"#base#block#size#thread#0#", var"#block#rem#step#0#"),
585-
var"#base#block#size#thread#0#"
586-
)
587-
var"#nblock#size#thread#1#" = Core.ifelse(
588-
var"#thread#launch#count#1#" < (var"#nrem#thread#1#" % UInt32),
589-
vadd_nw(var"#base#block#size#thread#1#", var"#block#rem#step#1#"),
590-
var"#base#block#size#thread#1#"
591-
)
592-
var"#trailzing#zeros#" = vadd_nw(var"#trailzing#zeros#", 0x00000001)
593-
$iterstop1
594-
$iterstop2
595-
var"#thread#id#" = vadd_nw(var"#thread#id#", var"#trailzing#zeros#")
596-
# @show var"#thread#id#" $loopboundexpr
597-
var"##lbvargs#to_launch##" = ($loopboundexpr, var"#vargs#")
598-
avx_launch(Val{$UNROLL}(), $OPS, $ARF, $AM, $LPSYM, StaticType{typeof(var"##lbvargs#to_launch##")}(), flatten_to_tuple(var"##lbvargs#to_launch##"), var"#thread#id#")
599-
var"#thread#mask#" >>>= var"#trailzing#zeros#"
543+
# if var"#nthreads#" ≤ 1
544+
# $_turbo_orig_
545+
# return $retexpr
546+
# end
547+
$define_len1
548+
$define_len2
549+
$define_num_unrolls1
550+
$define_num_unrolls2
551+
var"#unroll#prod#" = vmul_nw(var"#num#unrolls#thread#0#", var"#num#unrolls#thread#1#")
552+
if var"#nthreads#" ≥ var"#unroll#prod#"
553+
var"#nthreads#" = var"#unroll#prod#"
554+
var"#thread#factor#0#" = var"#num#unrolls#thread#0#"
555+
var"#thread#factor#1#" = var"#num#unrolls#thread#1#"
556+
else
557+
var"##thread#0##excess##" = var"#num#unrolls#thread#0#" ≥ var"#nthreads#"
558+
var"##thread#1##excess##" = var"#num#unrolls#thread#1#" ≥ var"#nthreads#"
559+
if var"##thread#0##excess##" & var"##thread#1##excess##"
560+
$blockdef
561+
elseif var"##thread#0##excess##" # var"#num#unrolls#thread#1#" is small but var"#num#unrolls#thread#0#" is not; we want to place a small one in front
562+
(var"#thread#factor#1#", var"#thread#factor#0#") = _choose_num_blocks(var"#num#unrolls#thread#1#", StaticInt{1}(), var"#nthreads#", $(staticexpr(ntmax % Int)))
563+
else # var"#num#unrolls#thread#0#" is small, and var"#num#unrolls#thread#1#" may or may not be
564+
(var"#thread#factor#0#", var"#thread#factor#1#") = _choose_num_blocks(var"#num#unrolls#thread#0#", StaticInt{1}(), var"#nthreads#", $(staticexpr(ntmax % Int)))
565+
end
566+
var"#thread#factor#0#" = min(var"#thread#factor#0#", var"#num#unrolls#thread#0#")
567+
var"#thread#factor#1#" = min(var"#thread#factor#1#", var"#num#unrolls#thread#1#")
568+
end
569+
# @show (var"#thread#factor#0#", var"#thread#factor#1#")
570+
var"#nrequest#" = vsub_nsw((var"#nthreads#" % UInt32), 0x00000001)
571+
var"#loop#1#start#init#" = var"#iter#start#0#"
572+
var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
573+
if var"##do#thread##"
574+
var"#threads#", var"#torelease#" = Polyester.request_threads(Threads.threadid(), var"#nrequest#")
575+
$iterdef1
576+
$iterdef2
577+
# @show var"#base#block#size#thread#0#", var"#block#rem#step#0#" var"#base#block#size#thread#1#", var"#block#rem#step#1#"
578+
var"#thread#launch#count#" = 0x00000000
579+
var"#thread#launch#count#0#" = 0x00000000
580+
var"#thread#launch#count#1#" = 0x00000000
581+
var"#thread#id#" = 0x00000000
582+
var"#thread#mask#" = Polyester.mask(var"#threads#")
583+
var"#threads#remain#" = true
584+
while var"#threads#remain#"
585+
VectorizationBase.assume(var"#thread#mask#" ≠ zero(var"#thread#mask#"))
586+
var"#trailzing#zeros#" = Base.trailing_zeros(var"#thread#mask#") % UInt32
587+
var"#nblock#size#thread#0#" = Core.ifelse(
588+
var"#thread#launch#count#0#" < (var"#nrem#thread#0#" % UInt32),
589+
vadd_nw(var"#base#block#size#thread#0#", var"#block#rem#step#0#"),
590+
var"#base#block#size#thread#0#"
591+
)
592+
var"#nblock#size#thread#1#" = Core.ifelse(
593+
var"#thread#launch#count#1#" < (var"#nrem#thread#1#" % UInt32),
594+
vadd_nw(var"#base#block#size#thread#1#", var"#block#rem#step#1#"),
595+
var"#base#block#size#thread#1#"
596+
)
597+
var"#trailzing#zeros#" = vadd_nw(var"#trailzing#zeros#", 0x00000001)
598+
$iterstop1
599+
$iterstop2
600+
var"#thread#id#" = vadd_nw(var"#thread#id#", var"#trailzing#zeros#")
601+
# @show var"#thread#id#" $loopboundexpr
602+
var"##lbvargs#to_launch##" = ($loopboundexpr, var"#vargs#")
603+
avx_launch(Val{$UNROLL}(), $OPS, $ARF, $AM, $LPSYM, StaticType{typeof(var"##lbvargs#to_launch##")}(), flatten_to_tuple(var"##lbvargs#to_launch##"), var"#thread#id#")
604+
var"#thread#mask#" >>>= var"#trailzing#zeros#"
600605

601-
var"##end#inner##" = var"#thread#launch#count#0#" == vsub_nw(var"#thread#factor#0#", 0x00000001)
602-
var"#thread#launch#count#0#" = Core.ifelse(var"##end#inner##", 0x00000000, vadd_nw(var"#thread#launch#count#0#", 0x00000001))
603-
var"#thread#launch#count#1#" = Core.ifelse(var"##end#inner##", var"#thread#launch#count#1#" + 0x00000001, var"#thread#launch#count#1#")
606+
var"##end#inner##" = var"#thread#launch#count#0#" == vsub_nw(var"#thread#factor#0#", 0x00000001)
607+
var"#thread#launch#count#0#" = Core.ifelse(var"##end#inner##", 0x00000000, vadd_nw(var"#thread#launch#count#0#", 0x00000001))
608+
var"#thread#launch#count#1#" = Core.ifelse(var"##end#inner##", var"#thread#launch#count#1#" + 0x00000001, var"#thread#launch#count#1#")
604609

605-
var"#iter#start#0#" = Core.ifelse(var"##end#inner##", var"#loop#1#start#init#", var"#iter#stop#0#")
606-
var"#iter#start#1#" = Core.ifelse(var"##end#inner##", var"#iter#stop#1#", var"#iter#start#1#")
610+
var"#iter#start#0#" = Core.ifelse(var"##end#inner##", var"#loop#1#start#init#", var"#iter#stop#0#")
611+
var"#iter#start#1#" = Core.ifelse(var"##end#inner##", var"#iter#stop#1#", var"#iter#start#1#")
607612

608-
var"#thread#launch#count#" = vadd_nw(var"#thread#launch#count#", 0x00000001)
609-
var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
613+
var"#thread#launch#count#" = vadd_nw(var"#thread#launch#count#", 0x00000001)
614+
var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
615+
end
616+
else# eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't
617+
var"#torelease#" = zero(Polyester.worker_type())
618+
var"#threads#" = Polyester.UnsignedIteratorEarlyStop(var"#torelease#", 0x00000000)
610619
end
611-
else# eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't
612-
var"#torelease#" = zero(Polyester.worker_type())
613-
var"#threads#" = Polyester.UnsignedIteratorEarlyStop(var"#torelease#", 0x00000000)
614620
end
615621
# @show $lastboundexpr
616622
var"#avx#call#args#" = $avxcall_args
@@ -633,7 +639,6 @@ function thread_two_loops_expr(
633639
Polyester.free_threads!(var"#torelease#")
634640
$retexpr
635641
end
636-
# @show
637642
# Expr(:block, Expr(:meta,:inline), ls.preamble, q)
638643
Expr(:block, ls.preamble, q)
639644
end

0 commit comments

Comments
 (0)