@@ -380,44 +380,47 @@ function thread_one_loops_expr(
380
380
iterdef = define_block_size (threadedloop, ua. vloop, 0 , ls. vector_width)
381
381
q = quote
382
382
$ choose_nthread # UInt
383
- $ define_len
384
- $ define_num_unrolls
385
- var"#nthreads#" = Base. min (var"#nthreads#" , var"#num#unrolls#thread#0#" )
386
- var"#nrequest#" = vsub_nw ((var"#nthreads#" % UInt32), 0x00000001 )
387
383
$ loopstart
388
- var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
384
+ var"##do#thread##" = var"#nthreads#" > one (var"#nthreads#" )
389
385
if var"##do#thread##"
390
- var"#threads#" , var"#torelease#" = Polyester. request_threads (Threads. threadid ()% UInt32, var"#nrequest#" )
391
- var"#thread#factor#0#" = var"#nthreads#"
392
- $ iterdef
393
- var"#thread#launch#count#" = 0x00000000
394
- var"#thread#id#" = 0x00000000
395
- var"#thread#mask#" = Polyester. mask (var"#threads#" )
396
- var"#threads#remain#" = true
397
- while var"#threads#remain#"
398
- VectorizationBase. assume (var"#thread#mask#" ≠ zero (var"#thread#mask#" ))
399
- var"#trailzing#zeros#" = Base. trailing_zeros (var"#thread#mask#" ) % UInt32
400
- var"#nblock#size#thread#0#" = Core. ifelse (
401
- var"#thread#launch#count#" < (var"#nrem#thread#0#" % UInt32),
402
- vadd_nw (var"#base#block#size#thread#0#" , var"#block#rem#step#0#" ),
403
- var"#base#block#size#thread#0#"
404
- )
405
- var"#trailzing#zeros#" = vadd_nw (var"#trailzing#zeros#" , 0x00000001 )
406
- $ iterstop
407
- var"#thread#id#" = vadd_nw (var"#thread#id#" , var"#trailzing#zeros#" )
386
+ $ define_len
387
+ $ define_num_unrolls
388
+ var"#nthreads#" = Base. min (var"#nthreads#" , var"#num#unrolls#thread#0#" )
389
+ var"#nrequest#" = vsub_nw ((var"#nthreads#" % UInt32), 0x00000001 )
390
+ var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
391
+ if var"##do#thread##"
392
+ var"#threads#" , var"#torelease#" = Polyester. request_threads (Threads. threadid ()% UInt32, var"#nrequest#" )
393
+ var"#thread#factor#0#" = var"#nthreads#"
394
+ $ iterdef
395
+ var"#thread#launch#count#" = 0x00000000
396
+ var"#thread#id#" = 0x00000000
397
+ var"#thread#mask#" = Polyester. mask (var"#threads#" )
398
+ var"#threads#remain#" = true
399
+ while var"#threads#remain#"
400
+ VectorizationBase. assume (var"#thread#mask#" ≠ zero (var"#thread#mask#" ))
401
+ var"#trailzing#zeros#" = Base. trailing_zeros (var"#thread#mask#" ) % UInt32
402
+ var"#nblock#size#thread#0#" = Core. ifelse (
403
+ var"#thread#launch#count#" < (var"#nrem#thread#0#" % UInt32),
404
+ vadd_nw (var"#base#block#size#thread#0#" , var"#block#rem#step#0#" ),
405
+ var"#base#block#size#thread#0#"
406
+ )
407
+ var"#trailzing#zeros#" = vadd_nw (var"#trailzing#zeros#" , 0x00000001 )
408
+ $ iterstop
409
+ var"#thread#id#" = vadd_nw (var"#thread#id#" , var"#trailzing#zeros#" )
408
410
409
- var"##lbvargs#to_launch##" = ($ loopboundexpr, var"#vargs#" )
410
- avx_launch (Val {$UNROLL} (), $ OPS, $ ARF, $ AM, $ LPSYM, StaticType {typeof(var"##lbvargs#to_launch##")} (), flatten_to_tuple (var"##lbvargs#to_launch##" ), var"#thread#id#" )
411
+ var"##lbvargs#to_launch##" = ($ loopboundexpr, var"#vargs#" )
412
+ avx_launch (Val {$UNROLL} (), $ OPS, $ ARF, $ AM, $ LPSYM, StaticType {typeof(var"##lbvargs#to_launch##")} (), flatten_to_tuple (var"##lbvargs#to_launch##" ), var"#thread#id#" )
411
413
412
- var"#thread#mask#" >>>= var"#trailzing#zeros#"
414
+ var"#thread#mask#" >>>= var"#trailzing#zeros#"
413
415
414
- var"#iter#start#0#" = var"#iter#stop#0#"
415
- var"#thread#launch#count#" = vadd_nw (var"#thread#launch#count#" , 0x00000001 )
416
- var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
416
+ var"#iter#start#0#" = var"#iter#stop#0#"
417
+ var"#thread#launch#count#" = vadd_nw (var"#thread#launch#count#" , 0x00000001 )
418
+ var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
419
+ end
420
+ else # eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't
421
+ var"#torelease#" = zero (Polyester. worker_type ())
422
+ var"#threads#" = Polyester. UnsignedIteratorEarlyStop (var"#torelease#" , 0x00000000 )
417
423
end
418
- else # eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't
419
- var"#torelease#" = zero (Polyester. worker_type ())
420
- var"#threads#" = Polyester. UnsignedIteratorEarlyStop (var"#torelease#" , 0x00000000 )
421
424
end
422
425
var"#avx#call#args#" = $ avxcall_args
423
426
$ _turbo_call_
@@ -438,6 +441,7 @@ function thread_one_loops_expr(
438
441
Polyester. free_threads! (var"#torelease#" )
439
442
$ retexpr
440
443
end
444
+ # Expr(:block, Expr(:meta,:inline), ls.preamble, q)
441
445
Expr (:block , ls. preamble, q)
442
446
end
443
447
function define_vthread_blocks (vloop, u₁loop, u₂loop, u₁, u₂, ntmax, tn)
@@ -534,83 +538,85 @@ function thread_two_loops_expr(
534
538
$ choose_nthread # UInt
535
539
$ loopstart1
536
540
$ loopstart2
537
- # if var"#nthreads#" ≤ 1
538
- # $_turbo_orig_
539
- # return $retexpr
540
- # end
541
- $ define_len1
542
- $ define_len2
543
- $ define_num_unrolls1
544
- $ define_num_unrolls2
545
- var"#unroll#prod#" = vmul_nw (var"#num#unrolls#thread#0#" , var"#num#unrolls#thread#1#" )
546
- if var"#nthreads#" ≥ var"#unroll#prod#"
547
- var"#nthreads#" = var"#unroll#prod#"
548
- var"#thread#factor#0#" = var"#num#unrolls#thread#0#"
549
- var"#thread#factor#1#" = var"#num#unrolls#thread#1#"
550
- else
551
- var"##thread#0##excess##" = var"#num#unrolls#thread#0#" ≥ var"#nthreads#"
552
- var"##thread#1##excess##" = var"#num#unrolls#thread#1#" ≥ var"#nthreads#"
553
- if var"##thread#0##excess##" & var"##thread#1##excess##"
554
- $ blockdef
555
- elseif var"##thread#0##excess##" # var"#num#unrolls#thread#1#" is small but var"#num#unrolls#thread#0#" is not; we want to place a small one in front
556
- (var"#thread#factor#1#" , var"#thread#factor#0#" ) = _choose_num_blocks (var"#num#unrolls#thread#1#" , StaticInt {1} (), var"#nthreads#" , $ (staticexpr (ntmax % Int)))
557
- else # var"#num#unrolls#thread#0#" is small, and var"#num#unrolls#thread#1#" may or may not be
558
- (var"#thread#factor#0#" , var"#thread#factor#1#" ) = _choose_num_blocks (var"#num#unrolls#thread#0#" , StaticInt {1} (), var"#nthreads#" , $ (staticexpr (ntmax % Int)))
559
- end
560
- var"#num#unrolls#thread#1#"
561
- var"#thread#factor#0#" = min (var"#thread#factor#0#" , var"#num#unrolls#thread#0#" )
562
- var"#thread#factor#1#" = min (var"#thread#factor#1#" , var"#num#unrolls#thread#1#" )
563
- end
564
- # @show (var"#thread#factor#0#", var"#thread#factor#1#")
565
- var"#nrequest#" = vsub_nsw ((var"#nthreads#" % UInt32), 0x00000001 )
566
- var"#loop#1#start#init#" = var"#iter#start#0#"
567
- var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
541
+ var"##do#thread##" = var"#nthreads#" > one (var"#nthreads#" )
568
542
if var"##do#thread##"
569
- var"#threads#" , var"#torelease#" = Polyester. request_threads (Threads. threadid (), var"#nrequest#" )
570
- $ iterdef1
571
- $ iterdef2
572
- # @show var"#base#block#size#thread#0#", var"#block#rem#step#0#" var"#base#block#size#thread#1#", var"#block#rem#step#1#"
573
- var"#thread#launch#count#" = 0x00000000
574
- var"#thread#launch#count#0#" = 0x00000000
575
- var"#thread#launch#count#1#" = 0x00000000
576
- var"#thread#id#" = 0x00000000
577
- var"#thread#mask#" = Polyester. mask (var"#threads#" )
578
- var"#threads#remain#" = true
579
- while var"#threads#remain#"
580
- VectorizationBase. assume (var"#thread#mask#" ≠ zero (var"#thread#mask#" ))
581
- var"#trailzing#zeros#" = Base. trailing_zeros (var"#thread#mask#" ) % UInt32
582
- var"#nblock#size#thread#0#" = Core. ifelse (
583
- var"#thread#launch#count#0#" < (var"#nrem#thread#0#" % UInt32),
584
- vadd_nw (var"#base#block#size#thread#0#" , var"#block#rem#step#0#" ),
585
- var"#base#block#size#thread#0#"
586
- )
587
- var"#nblock#size#thread#1#" = Core. ifelse (
588
- var"#thread#launch#count#1#" < (var"#nrem#thread#1#" % UInt32),
589
- vadd_nw (var"#base#block#size#thread#1#" , var"#block#rem#step#1#" ),
590
- var"#base#block#size#thread#1#"
591
- )
592
- var"#trailzing#zeros#" = vadd_nw (var"#trailzing#zeros#" , 0x00000001 )
593
- $ iterstop1
594
- $ iterstop2
595
- var"#thread#id#" = vadd_nw (var"#thread#id#" , var"#trailzing#zeros#" )
596
- # @show var"#thread#id#" $loopboundexpr
597
- var"##lbvargs#to_launch##" = ($ loopboundexpr, var"#vargs#" )
598
- avx_launch (Val {$UNROLL} (), $ OPS, $ ARF, $ AM, $ LPSYM, StaticType {typeof(var"##lbvargs#to_launch##")} (), flatten_to_tuple (var"##lbvargs#to_launch##" ), var"#thread#id#" )
599
- var"#thread#mask#" >>>= var"#trailzing#zeros#"
543
+ # if var"#nthreads#" ≤ 1
544
+ # $_turbo_orig_
545
+ # return $retexpr
546
+ # end
547
+ $ define_len1
548
+ $ define_len2
549
+ $ define_num_unrolls1
550
+ $ define_num_unrolls2
551
+ var"#unroll#prod#" = vmul_nw (var"#num#unrolls#thread#0#" , var"#num#unrolls#thread#1#" )
552
+ if var"#nthreads#" ≥ var"#unroll#prod#"
553
+ var"#nthreads#" = var"#unroll#prod#"
554
+ var"#thread#factor#0#" = var"#num#unrolls#thread#0#"
555
+ var"#thread#factor#1#" = var"#num#unrolls#thread#1#"
556
+ else
557
+ var"##thread#0##excess##" = var"#num#unrolls#thread#0#" ≥ var"#nthreads#"
558
+ var"##thread#1##excess##" = var"#num#unrolls#thread#1#" ≥ var"#nthreads#"
559
+ if var"##thread#0##excess##" & var"##thread#1##excess##"
560
+ $ blockdef
561
+ elseif var"##thread#0##excess##" # var"#num#unrolls#thread#1#" is small but var"#num#unrolls#thread#0#" is not; we want to place a small one in front
562
+ (var"#thread#factor#1#" , var"#thread#factor#0#" ) = _choose_num_blocks (var"#num#unrolls#thread#1#" , StaticInt {1} (), var"#nthreads#" , $ (staticexpr (ntmax % Int)))
563
+ else # var"#num#unrolls#thread#0#" is small, and var"#num#unrolls#thread#1#" may or may not be
564
+ (var"#thread#factor#0#" , var"#thread#factor#1#" ) = _choose_num_blocks (var"#num#unrolls#thread#0#" , StaticInt {1} (), var"#nthreads#" , $ (staticexpr (ntmax % Int)))
565
+ end
566
+ var"#thread#factor#0#" = min (var"#thread#factor#0#" , var"#num#unrolls#thread#0#" )
567
+ var"#thread#factor#1#" = min (var"#thread#factor#1#" , var"#num#unrolls#thread#1#" )
568
+ end
569
+ # @show (var"#thread#factor#0#", var"#thread#factor#1#")
570
+ var"#nrequest#" = vsub_nsw ((var"#nthreads#" % UInt32), 0x00000001 )
571
+ var"#loop#1#start#init#" = var"#iter#start#0#"
572
+ var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
573
+ if var"##do#thread##"
574
+ var"#threads#" , var"#torelease#" = Polyester. request_threads (Threads. threadid (), var"#nrequest#" )
575
+ $ iterdef1
576
+ $ iterdef2
577
+ # @show var"#base#block#size#thread#0#", var"#block#rem#step#0#" var"#base#block#size#thread#1#", var"#block#rem#step#1#"
578
+ var"#thread#launch#count#" = 0x00000000
579
+ var"#thread#launch#count#0#" = 0x00000000
580
+ var"#thread#launch#count#1#" = 0x00000000
581
+ var"#thread#id#" = 0x00000000
582
+ var"#thread#mask#" = Polyester. mask (var"#threads#" )
583
+ var"#threads#remain#" = true
584
+ while var"#threads#remain#"
585
+ VectorizationBase. assume (var"#thread#mask#" ≠ zero (var"#thread#mask#" ))
586
+ var"#trailzing#zeros#" = Base. trailing_zeros (var"#thread#mask#" ) % UInt32
587
+ var"#nblock#size#thread#0#" = Core. ifelse (
588
+ var"#thread#launch#count#0#" < (var"#nrem#thread#0#" % UInt32),
589
+ vadd_nw (var"#base#block#size#thread#0#" , var"#block#rem#step#0#" ),
590
+ var"#base#block#size#thread#0#"
591
+ )
592
+ var"#nblock#size#thread#1#" = Core. ifelse (
593
+ var"#thread#launch#count#1#" < (var"#nrem#thread#1#" % UInt32),
594
+ vadd_nw (var"#base#block#size#thread#1#" , var"#block#rem#step#1#" ),
595
+ var"#base#block#size#thread#1#"
596
+ )
597
+ var"#trailzing#zeros#" = vadd_nw (var"#trailzing#zeros#" , 0x00000001 )
598
+ $ iterstop1
599
+ $ iterstop2
600
+ var"#thread#id#" = vadd_nw (var"#thread#id#" , var"#trailzing#zeros#" )
601
+ # @show var"#thread#id#" $loopboundexpr
602
+ var"##lbvargs#to_launch##" = ($ loopboundexpr, var"#vargs#" )
603
+ avx_launch (Val {$UNROLL} (), $ OPS, $ ARF, $ AM, $ LPSYM, StaticType {typeof(var"##lbvargs#to_launch##")} (), flatten_to_tuple (var"##lbvargs#to_launch##" ), var"#thread#id#" )
604
+ var"#thread#mask#" >>>= var"#trailzing#zeros#"
600
605
601
- var"##end#inner##" = var"#thread#launch#count#0#" == vsub_nw (var"#thread#factor#0#" , 0x00000001 )
602
- var"#thread#launch#count#0#" = Core. ifelse (var"##end#inner##" , 0x00000000 , vadd_nw (var"#thread#launch#count#0#" , 0x00000001 ))
603
- var"#thread#launch#count#1#" = Core. ifelse (var"##end#inner##" , var"#thread#launch#count#1#" + 0x00000001 , var"#thread#launch#count#1#" )
606
+ var"##end#inner##" = var"#thread#launch#count#0#" == vsub_nw (var"#thread#factor#0#" , 0x00000001 )
607
+ var"#thread#launch#count#0#" = Core. ifelse (var"##end#inner##" , 0x00000000 , vadd_nw (var"#thread#launch#count#0#" , 0x00000001 ))
608
+ var"#thread#launch#count#1#" = Core. ifelse (var"##end#inner##" , var"#thread#launch#count#1#" + 0x00000001 , var"#thread#launch#count#1#" )
604
609
605
- var"#iter#start#0#" = Core. ifelse (var"##end#inner##" , var"#loop#1#start#init#" , var"#iter#stop#0#" )
606
- var"#iter#start#1#" = Core. ifelse (var"##end#inner##" , var"#iter#stop#1#" , var"#iter#start#1#" )
610
+ var"#iter#start#0#" = Core. ifelse (var"##end#inner##" , var"#loop#1#start#init#" , var"#iter#stop#0#" )
611
+ var"#iter#start#1#" = Core. ifelse (var"##end#inner##" , var"#iter#stop#1#" , var"#iter#start#1#" )
607
612
608
- var"#thread#launch#count#" = vadd_nw (var"#thread#launch#count#" , 0x00000001 )
609
- var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
613
+ var"#thread#launch#count#" = vadd_nw (var"#thread#launch#count#" , 0x00000001 )
614
+ var"#threads#remain#" = var"#thread#launch#count#" ≠ var"#nrequest#"
615
+ end
616
+ else # eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't
617
+ var"#torelease#" = zero (Polyester. worker_type ())
618
+ var"#threads#" = Polyester. UnsignedIteratorEarlyStop (var"#torelease#" , 0x00000000 )
610
619
end
611
- else # eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't
612
- var"#torelease#" = zero (Polyester. worker_type ())
613
- var"#threads#" = Polyester. UnsignedIteratorEarlyStop (var"#torelease#" , 0x00000000 )
614
620
end
615
621
# @show $lastboundexpr
616
622
var"#avx#call#args#" = $ avxcall_args
@@ -633,7 +639,6 @@ function thread_two_loops_expr(
633
639
Polyester. free_threads! (var"#torelease#" )
634
640
$ retexpr
635
641
end
636
- # @show
637
642
# Expr(:block, Expr(:meta,:inline), ls.preamble, q)
638
643
Expr (:block , ls. preamble, q)
639
644
end
0 commit comments