@@ -349,15 +349,15 @@ function thread_one_loops_expr(
349
349
loop_boundary! (lastboundexpr, loop)
350
350
end
351
351
end
352
- _avx_call_core_ = :(_avx_! (Val {$UNROLL} (), $ OPS, $ ARF, $ AM, $ LPSYM, ($ lastboundexpr, var"#vargs#" )))
353
- _avx_call_ = _avx_call_core_
352
+ _avx_call_ = :(_avx_! (Val {$UNROLL} (), $ OPS, $ ARF, $ AM, $ LPSYM, ($ lastboundexpr, var"#vargs#" )))
354
353
update_return_values = if length (ls. outer_reductions) > 0
355
354
retv = loopset_return_value (ls, Val (false ))
356
355
_avx_call_ = Expr (:(= ), retv, _avx_call_)
357
356
outer_reduct_combine_expressions (ls, retv)
358
357
else
359
358
nothing
360
359
end
360
+ retexpr = length (ls. outer_reductions) > 0 ? :(return $ retv) : :(return nothing )
361
361
# @unpack u₁loop, u₂loop, vloop, u₁, u₂max = ua
362
362
iterdef = define_block_size (threadedloop, ua. vloop, 0 , ls. vector_width[])
363
363
q = quote
@@ -367,37 +367,43 @@ function thread_one_loops_expr(
367
367
var"#nthreads#" = Base. min (var"#nthreads#" , var"#num#unrolls#thread#0#" )
368
368
var"#nrequest#" = (var"#nthreads#" % UInt32) - 0x00000001
369
369
$ loopstart
370
- var"#nrequest#" == 0x00000000 && return $ _avx_call_core_
371
- var"#threads#" , var"#torelease#" = CheapThreads. request_threads (Threads. threadid ()% UInt32, var"#nrequest#" )
372
- var"#thread#factor#0#" = var"#nthreads#"
373
- $ iterdef
374
- var"#thread#launch#count#" = 0x00000000
375
- var"#thread#id#" = 0x00000000
376
- var"#thread#mask#" = CheapThreads. mask (var"#threads#" )
377
- var"#threads#remain#" = true
378
- while var"#threads#remain#"
379
- VectorizationBase. assume (var"#thread#mask#" ≠ zero (var"#thread#mask#" ))
380
- var"#trailzing#zeros#" = Base. trailing_zeros (var"#thread#mask#" ) % UInt32
381
- var"#nblock#size#thread#0#" = Core. ifelse (
382
- var"#thread#launch#count#" < (var"#nrem#thread#0#" % UInt32),
383
- var"#base#block#size#thread#0#" + var"#block#rem#step#0#" ,
384
- var"#base#block#size#thread#0#"
385
- )
386
- var"#trailzing#zeros#" += 0x00000001
387
- $ iterstop
388
- var"#thread#id#" += var"#trailzing#zeros#"
370
+ var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
371
+ if var"##do#thread##"
372
+ var"#threads#" , var"#torelease#" = CheapThreads. request_threads (Threads. threadid ()% UInt32, var"#nrequest#" )
373
+ var"#thread#factor#0#" = var"#nthreads#"
374
+ $ iterdef
375
+ var"#thread#launch#count#" = 0x00000000
376
+ var"#thread#id#" = 0x00000000
377
+ var"#thread#mask#" = CheapThreads. mask (var"#threads#" )
378
+ var"#threads#remain#" = true
379
+ while var"#threads#remain#"
380
+ VectorizationBase. assume (var"#thread#mask#" ≠ zero (var"#thread#mask#" ))
381
+ var"#trailzing#zeros#" = Base. trailing_zeros (var"#thread#mask#" ) % UInt32
382
+ var"#nblock#size#thread#0#" = Core. ifelse (
383
+ var"#thread#launch#count#" < (var"#nrem#thread#0#" % UInt32),
384
+ var"#base#block#size#thread#0#" + var"#block#rem#step#0#" ,
385
+ var"#base#block#size#thread#0#"
386
+ )
387
+ var"#trailzing#zeros#" += 0x00000001
388
+ $ iterstop
389
+ var"#thread#id#" += var"#trailzing#zeros#"
389
390
390
- avx_launch (
391
- Val {$UNROLL} (), $ OPS, $ ARF, $ AM, $ LPSYM,
392
- $ loopboundexpr, var"#vargs#" , var"#thread#id#"
393
- )
391
+ avx_launch (
392
+ Val {$UNROLL} (), $ OPS, $ ARF, $ AM, $ LPSYM,
393
+ $ loopboundexpr, var"#vargs#" , var"#thread#id#"
394
+ )
394
395
395
- var"#thread#mask#" >>>= var"#trailzing#zeros#"
396
+ var"#thread#mask#" >>>= var"#trailzing#zeros#"
396
397
397
- var"#iter#start#0#" = var"#iter#stop#0#"
398
- var"#threads#remain#" = (var"#thread#launch#count#" += 0x00000001 ) ≠ var"#nrequest#"
398
+ var"#iter#start#0#" = var"#iter#stop#0#"
399
+ var"#threads#remain#" = (var"#thread#launch#count#" += 0x00000001 ) ≠ var"#nrequest#"
400
+ end
401
+ else # eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't
402
+ var"#torelease#" = zero (CheapThreads. worker_type ())
403
+ var"#threads#" = CheapThreads. UnsignedIteratorEarlyStop (var"#torelease#" , 0x00000000 )
399
404
end
400
405
$ _avx_call_
406
+ var"##do#thread##" || $ retexpr
401
407
var"#thread#id#" = 0x00000000
402
408
var"#thread#mask#" = CheapThreads. mask (var"#threads#" )
403
409
var"#threads#remain#" = true
@@ -413,8 +419,8 @@ function thread_one_loops_expr(
413
419
var"#threads#remain#" = var"#thread#mask#" ≠ 0x00000000
414
420
end
415
421
CheapThreads. free_threads! (var"#torelease#" )
422
+ $ retexpr
416
423
end
417
- length (ls. outer_reductions) > 0 ? push! (q. args, retv) : push! (q. args, nothing )
418
424
Expr (:block , ls. preamble, q)
419
425
end
420
426
function define_vthread_blocks (vloop, u₁loop, u₂loop, u₁, u₂, ntmax, tn)
@@ -484,8 +490,7 @@ function thread_two_loops_expr(
484
490
loop_boundary! (lastboundexpr, loop)
485
491
end
486
492
end
487
- _avx_call_core_ = :(_avx_! (Val {$UNROLL} (), $ OPS, $ ARF, $ AM, $ LPSYM, ($ lastboundexpr, var"#vargs#" )))
488
- _avx_call_ = _avx_call_core_
493
+ _avx_call_ = :(_avx_! (Val {$UNROLL} (), $ OPS, $ ARF, $ AM, $ LPSYM, ($ lastboundexpr, var"#vargs#" )))
489
494
update_return_values = if length (ls. outer_reductions) > 0
490
495
retv = loopset_return_value (ls, Val (false ))
491
496
_avx_call_ = Expr (:(= ), retv, _avx_call_)
@@ -496,6 +501,7 @@ function thread_two_loops_expr(
496
501
blockdef = define_thread_blocks (threadedloop1, threadedloop2, vloop, u₁loop, u₂loop, u₁, u₂, ntmax)
497
502
iterdef1 = define_block_size (threadedloop1, vloop, 0 , ls. vector_width[])
498
503
iterdef2 = define_block_size (threadedloop2, vloop, 1 , ls. vector_width[])
504
+ retexpr = length (ls. outer_reductions) > 0 ? :(return $ retv) : :(return nothing )
499
505
q = quote
500
506
$ choose_nthread # UInt
501
507
$ define_len1
@@ -515,54 +521,59 @@ function thread_two_loops_expr(
515
521
$ loopstart1
516
522
var"#loop#1#start#init#" = var"#iter#start#0#"
517
523
$ loopstart2
518
- var"#nrequest#" == 0x00000000 && return $ _avx_call_core_
519
- var"#threads#" , var"#torelease#" = CheapThreads. request_threads (Threads. threadid (), var"#nrequest#" )
524
+ var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
525
+ if var"##do#thread##"
526
+ var"#threads#" , var"#torelease#" = CheapThreads. request_threads (Threads. threadid (), var"#nrequest#" )
527
+ $ iterdef1
528
+ $ iterdef2
529
+ # @show var"#base#block#size#thread#0#", var"#block#rem#step#0#" var"#base#block#size#thread#1#", var"#block#rem#step#1#"
530
+ var"#thread#launch#count#" = 0x00000000
531
+ var"#thread#launch#count#0#" = 0x00000000
532
+ var"#thread#launch#count#1#" = 0x00000000
533
+ var"#thread#id#" = 0x00000000
534
+ var"#thread#mask#" = CheapThreads. mask (var"#threads#" )
535
+ var"#threads#remain#" = true
536
+ while var"#threads#remain#"
537
+ VectorizationBase. assume (var"#thread#mask#" ≠ zero (var"#thread#mask#" ))
538
+ var"#trailzing#zeros#" = Base. trailing_zeros (var"#thread#mask#" ) % UInt32
539
+ var"#nblock#size#thread#0#" = Core. ifelse (
540
+ var"#thread#launch#count#0#" < (var"#nrem#thread#0#" % UInt32),
541
+ var"#base#block#size#thread#0#" + var"#block#rem#step#0#" ,
542
+ var"#base#block#size#thread#0#"
543
+ )
544
+ var"#nblock#size#thread#1#" = Core. ifelse (
545
+ var"#thread#launch#count#1#" < (var"#nrem#thread#1#" % UInt32),
546
+ var"#base#block#size#thread#1#" + var"#block#rem#step#1#" ,
547
+ var"#base#block#size#thread#1#"
548
+ )
549
+ var"#trailzing#zeros#" += 0x00000001
550
+ $ iterstop1
551
+ $ iterstop2
552
+ var"#thread#id#" += var"#trailzing#zeros#"
553
+ # @show var"#thread#id#" $loopboundexpr
554
+ avx_launch (
555
+ Val {$UNROLL} (), $ OPS, $ ARF, $ AM, $ LPSYM,
556
+ $ loopboundexpr, var"#vargs#" , var"#thread#id#"
557
+ )
520
558
521
- $ iterdef1
522
- $ iterdef2
523
- # @show var"#base#block#size#thread#0#", var"#block#rem#step#0#" var"#base#block#size#thread#1#", var"#block#rem#step#1#"
524
- var"#thread#launch#count#" = 0x00000000
525
- var"#thread#launch#count#0#" = 0x00000000
526
- var"#thread#launch#count#1#" = 0x00000000
527
- var"#thread#id#" = 0x00000000
528
- var"#thread#mask#" = CheapThreads. mask (var"#threads#" )
529
- var"#threads#remain#" = true
530
- while var"#threads#remain#"
531
- VectorizationBase. assume (var"#thread#mask#" ≠ zero (var"#thread#mask#" ))
532
- var"#trailzing#zeros#" = Base. trailing_zeros (var"#thread#mask#" ) % UInt32
533
- var"#nblock#size#thread#0#" = Core. ifelse (
534
- var"#thread#launch#count#0#" < (var"#nrem#thread#0#" % UInt32),
535
- var"#base#block#size#thread#0#" + var"#block#rem#step#0#" ,
536
- var"#base#block#size#thread#0#"
537
- )
538
- var"#nblock#size#thread#1#" = Core. ifelse (
539
- var"#thread#launch#count#1#" < (var"#nrem#thread#1#" % UInt32),
540
- var"#base#block#size#thread#1#" + var"#block#rem#step#1#" ,
541
- var"#base#block#size#thread#1#"
542
- )
543
- var"#trailzing#zeros#" += 0x00000001
544
- $ iterstop1
545
- $ iterstop2
546
- var"#thread#id#" += var"#trailzing#zeros#"
547
- # @show var"#thread#id#" $loopboundexpr
548
- avx_launch (
549
- Val {$UNROLL} (), $ OPS, $ ARF, $ AM, $ LPSYM,
550
- $ loopboundexpr, var"#vargs#" , var"#thread#id#"
551
- )
559
+ var"#thread#mask#" >>>= var"#trailzing#zeros#"
552
560
553
- var"#thread#mask#" >>>= var"#trailzing#zeros#"
561
+ var"##end#inner##" = var"#thread#launch#count#0#" == (var"#thread#factor#0#" - 0x00000001 )
562
+ var"#thread#launch#count#0#" = Core. ifelse (var"##end#inner##" , 0x00000000 , var"#thread#launch#count#0#" + 0x00000001 )
563
+ var"#thread#launch#count#1#" = Core. ifelse (var"##end#inner##" , var"#thread#launch#count#1#" + 0x00000001 , var"#thread#launch#count#1#" )
554
564
555
- var"##end#inner##" = var"#thread#launch#count#0#" == (var"#thread#factor#0#" - 0x00000001 )
556
- var"#thread#launch#count#0#" = Core. ifelse (var"##end#inner##" , 0x00000000 , var"#thread#launch#count#0#" + 0x00000001 )
557
- var"#thread#launch#count#1#" = Core. ifelse (var"##end#inner##" , var"#thread#launch#count#1#" + 0x00000001 , var"#thread#launch#count#1#" )
565
+ var"#iter#start#0#" = Core. ifelse (var"##end#inner##" , var"#loop#1#start#init#" , var"#iter#stop#0#" )
566
+ var"#iter#start#1#" = Core. ifelse (var"##end#inner##" , var"#iter#stop#1#" , var"#iter#start#1#" )
558
567
559
- var"#iter#start#0#" = Core. ifelse (var"##end#inner##" , var"#loop#1#start#init#" , var"#iter#stop#0#" )
560
- var"#iter#start#1#" = Core. ifelse (var"##end#inner##" , var"#iter#stop#1#" , var"#iter#start#1#" )
561
-
562
- var"#threads#remain#" = (var"#thread#launch#count#" += 0x00000001 ) ≠ var"#nrequest#"
568
+ var"#threads#remain#" = (var"#thread#launch#count#" += 0x00000001 ) ≠ var"#nrequest#"
569
+ end
570
+ else # eliminate undef var errors that the compiler should be able to figure out are unreachable, but doesn't
571
+ var"#torelease#" = zero (CheapThreads. worker_type ())
572
+ var"#threads#" = CheapThreads. UnsignedIteratorEarlyStop (var"#torelease#" , 0x00000000 )
563
573
end
564
574
# @show $lastboundexpr
565
575
$ _avx_call_
576
+ var"##do#thread##" || $ retexpr
566
577
# @show $retv
567
578
var"#thread#id#" = 0x00000000
568
579
var"#thread#mask#" = CheapThreads. mask (var"#threads#" )
@@ -579,8 +590,8 @@ function thread_two_loops_expr(
579
590
var"#threads#remain#" = var"#thread#mask#" ≠ 0x00000000
580
591
end
581
592
CheapThreads. free_threads! (var"#torelease#" )
593
+ $ retexpr
582
594
end
583
- length (ls. outer_reductions) > 0 ? push! (q. args, retv) : push! (q. args, nothing )
584
595
# @show
585
596
Expr (:block , ls. preamble, q)
586
597
end
0 commit comments