@@ -291,7 +291,7 @@ function allinteriorunrolled(ls::LoopSet, us::UnrollSpecification, N)
291
291
unroll_total ≤ 8
292
292
end
293
293
294
- function lower_no_unroll (ls:: LoopSet , us:: UnrollSpecification , n:: Int , inclmask:: Bool )
294
+ function lower_no_unroll (ls:: LoopSet , us:: UnrollSpecification , n:: Int , inclmask:: Bool , initialize :: Bool = true , maxiters :: Int = - 1 )
295
295
usorig = ls. unrollspecification[]
296
296
nisvectorized = isvectorized (us, n)
297
297
loopsym = names (ls)[n]
@@ -301,7 +301,7 @@ function lower_no_unroll(ls::LoopSet, us::UnrollSpecification, n::Int, inclmask:
301
301
# # return lower_llvm_unroll(ls, us, n, loop)
302
302
# end
303
303
# sl = startloop(loop, nisvectorized, loopsym)
304
- sl = startloop (ls, us, n)
304
+
305
305
tc = terminatecondition (ls, us, n, inclmask, 1 )
306
306
body = lower_block (ls, us, n, inclmask, 1 )
307
307
# align_loop = isone(n) & (ls.align_loops[] > 0)
@@ -319,11 +319,11 @@ function lower_no_unroll(ls::LoopSet, us::UnrollSpecification, n::Int, inclmask:
319
319
foreach (_ -> push! (q. args, body), 1 : (length (loop) ÷ W))
320
320
elseif nisvectorized
321
321
# Expr(:block, loopiteratesatleastonce(loop, true), Expr(:while, expect(tc), body))
322
- q = Expr (:block , Expr (:while , tc, body))
322
+ q = Expr (:block , Expr (maxiters == 1 ? :if : :while , tc, body))
323
323
else
324
324
termcond = gensym (:maybeterm )
325
325
push! (body. args, Expr (:(= ), termcond, tc))
326
- q = Expr (:block , Expr (:(= ), termcond, true ), Expr (:while , termcond, body))
326
+ q = Expr (:block , Expr (:(= ), termcond, true ), Expr (maxiters == 1 ? :if : :while , termcond, body))
327
327
# Expr(:block, Expr(:while, expect(tc), body))
328
328
# Expr(:block, assume(tc), Expr(:while, tc, body))
329
329
# push!(body.args, Expr(:&&, expect(Expr(:call, :!, tc)), Expr(:break)))
@@ -346,7 +346,11 @@ function lower_no_unroll(ls::LoopSet, us::UnrollSpecification, n::Int, inclmask:
346
346
push! (q. args, Expr (:if , tc, body))
347
347
end
348
348
end
349
- Expr (:block , Expr (:let , sl, q))
349
+ if initialize
350
+ Expr (:let , startloop (ls, us, n), q)
351
+ else
352
+ q
353
+ end
350
354
end
351
355
function lower_unrolled_dynamic (ls:: LoopSet , us:: UnrollSpecification , n:: Int , inclmask:: Bool )
352
356
UF = unrollfactor (us, n)
@@ -389,23 +393,49 @@ function lower_unrolled_dynamic(ls::LoopSet, us::UnrollSpecification, n::Int, in
389
393
end
390
394
remblock = Expr (:block )
391
395
(nisvectorized && (UFt > 0 ) && isone (num_loops (ls))) && push! (remblock. args, definemask (loop))
396
+ unroll_cleanup = true
392
397
else
393
398
remblock = init_remblock (loop, ls. lssm[], n)# loopsym)
399
+ # unroll_cleanup = Ureduct > 0 || (nisunrolled ? (u₂ > 1) : (u₁ > 1))
400
+ # remblock = unroll_cleanup ? init_remblock(loop, ls.lssm[], n)#loopsym) : Expr(:block)
394
401
q = Expr (:while , tc, body)
395
402
end
396
403
q = if unsigned (Ureduct) < unsigned (UF) # unsigned(-1) == typemax(UInt); is logic relying on twos-complement bad?
397
- UF_cleanup = UF - Ureduct
398
- us_cleanup = nisunrolled ? UnrollSpecification (us, UF_cleanup, u₂) : UnrollSpecification (us, u₁, UF_cleanup)
399
- Expr (
400
- :block ,
401
- add_upper_outer_reductions (ls, q, Ureduct, UF, loop, vectorized),
402
- Expr (
403
- # :if, terminatecondition(loop, us, n, loopsym, inclmask, UF_cleanup),
404
- :if , terminatecondition (ls, us, n, inclmask, UF_cleanup),
405
- lower_block (ls, us_cleanup, n, inclmask, UF_cleanup)
406
- ),
407
- remblock
408
- )
404
+ add_cleanup = true
405
+ if isone (Ureduct)
406
+ UF_cleanup = 1
407
+ if nisvectorized
408
+ blockhead = :while
409
+ else
410
+ blockhead = if UF == 2
411
+ if loopisstatic
412
+ add_cleanup = UFt == 1
413
+ :block
414
+ else
415
+ :if
416
+ end
417
+ else
418
+ :while
419
+ end
420
+ UFt = 0
421
+ end
422
+ elseif 2 Ureduct < UF
423
+ UF_cleanup = 2
424
+ blockhead = :while
425
+ else
426
+ UF_cleanup = UF - Ureduct
427
+ blockhead = :if
428
+ end
429
+ _q = Expr (:block , add_upper_outer_reductions (ls, q, Ureduct, UF, loop, vectorized, nisvectorized))
430
+ if add_cleanup
431
+ cleanup_expr = Expr (blockhead)
432
+ blockhead === :block || push! (cleanup_expr. args, terminatecondition (ls, us, n, inclmask, UF_cleanup))
433
+ us_cleanup = nisunrolled ? UnrollSpecification (us, UF_cleanup, u₂) : UnrollSpecification (us, u₁, UF_cleanup)
434
+ push! (cleanup_expr. args, lower_block (ls, us_cleanup, n, inclmask, UF_cleanup))
435
+ push! (_q. args, cleanup_expr)
436
+ end
437
+ UFt > 0 && push! (_q. args, remblock)
438
+ _q
409
439
elseif remfirst
410
440
numiters = length (loop) ÷ UF
411
441
if numiters > 2
@@ -440,10 +470,14 @@ function lower_unrolled_dynamic(ls::LoopSet, us::UnrollSpecification, n::Int, in
440
470
Expr ( :block , q, remblock )
441
471
end
442
472
if ! iszero (UFt)
473
+ # if unroll_cleanup
443
474
while true
444
475
ust = nisunrolled ? UnrollSpecification (us, UFt, u₂) : UnrollSpecification (us, u₁, UFt)
445
476
newblock = lower_block (ls, ust, n, remmask, UFt)
446
477
if (UFt ≥ UF - 1 + nisvectorized) || UFt == Ureduct || loopisstatic
478
+ if isone (num_loops (ls)) && isone (UFt) && isone (Ureduct)
479
+ newblock = Expr (:block , definemask (loop), newblock)
480
+ end
447
481
push! (remblock. args, newblock)
448
482
break
449
483
end
@@ -459,6 +493,11 @@ function lower_unrolled_dynamic(ls::LoopSet, us::UnrollSpecification, n::Int, in
459
493
end
460
494
UFt += 1
461
495
end
496
+ # else
497
+ # ust = nisunrolled ? UnrollSpecification(us, 1, u₂) : UnrollSpecification(us, u₁, 1)
498
+ # # newblock = lower_block(ls, ust, n, remmask, 1)
499
+ # push!(remblock.args, lower_no_unroll(ls, ust, n, inclmask, false, UF-1))
500
+ # end
462
501
end
463
502
Expr (:block , Expr (:let , sl, q))
464
503
end
@@ -529,26 +568,37 @@ end
529
568
function initialize_outer_reductions! (ls:: LoopSet , Umin:: Int , Umax:: Int , vectorized:: Symbol , suffix:: Union{Symbol,Nothing} = nothing )
530
569
initialize_outer_reductions! (ls. preamble, ls, Umin, Umax, vectorized, suffix)
531
570
end
532
- function add_upper_outer_reductions (ls:: LoopSet , loopq:: Expr , Ulow:: Int , Uhigh:: Int , unrolledloop:: Loop , vectorized:: Symbol )
533
- ifq = Expr (:block )
534
- initialize_outer_reductions! (ifq, ls, Ulow, Uhigh, vectorized)
535
- push! (ifq. args, loopq)
536
- reduce_range! (ifq, ls, Ulow, Uhigh)
537
- loopbuffer = Expr (:call , lv (:vmul ), VECTORWIDTHSYMBOL, Uhigh)
538
- comparison = if isstaticloop (unrolledloop)
539
- Expr (:call , lv (:scalar_less ), length (unrolledloop), loopbuffer)
571
+ function add_upper_comp_check (unrolledloop, loopbuffer)
572
+ if isstaticloop (unrolledloop)
573
+ Expr (:call , lv (:scalar_greaterequal ), length (unrolledloop), loopbuffer)
540
574
elseif unrolledloop. startexact
541
575
if isone (unrolledloop. starthint)
542
- Expr (:call , lv (:scalar_less ), unrolledloop. stopsym, loopbuffer)
576
+ Expr (:call , lv (:scalar_greaterequal ), unrolledloop. stopsym, loopbuffer)
543
577
else
544
- Expr (:call , lv (:scalar_less ), Expr (:call , lv (:vsub ), unrolledloop. stopsym, unrolledloop. starthint- 1 ), loopbuffer)
578
+ Expr (:call , lv (:scalar_greaterequal ), Expr (:call , lv (:vsub ), unrolledloop. stopsym, unrolledloop. starthint- 1 ), loopbuffer)
545
579
end
546
580
elseif unrolledloop. stopexact
547
- Expr (:call , lv (:scalar_less ), Expr (:call , lv (:vsub ), unrolledloop. stophint+ 1 , unrolledloop. startsym), loopbuffer)
581
+ Expr (:call , lv (:scalar_greaterequal ), Expr (:call , lv (:vsub ), unrolledloop. stophint+ 1 , unrolledloop. startsym), loopbuffer)
548
582
else # both are given by symbols
549
- Expr (:call , lv (:scalar_less ), Expr (:call , lv (:vsub ), unrolledloop. stopsym, Expr (:call ,lv (:vsub ),unrolledloop. startsym, staticexpr (1 ))), loopbuffer)
583
+ Expr (:call , lv (:scalar_greaterequal ), Expr (:call , lv (:vsub ), unrolledloop. stopsym, Expr (:call ,lv (:vsub ),unrolledloop. startsym, staticexpr (1 ))), loopbuffer)
584
+ end
585
+ end
586
+ function add_upper_outer_reductions (ls:: LoopSet , loopq:: Expr , Ulow:: Int , Uhigh:: Int , unrolledloop:: Loop , vectorized:: Symbol , reductisvectorized:: Bool )
587
+ ifq = Expr (:block )
588
+ initialize_outer_reductions! (ifq, ls, Ulow, Uhigh, vectorized)
589
+ push! (ifq. args, loopq)
590
+ _Ulow = Uhigh >>> 1 ; _Uhigh = Uhigh
591
+ while _Ulow > Ulow
592
+ reduce_range! (ifq, ls, _Ulow, _Uhigh)
593
+ _Uhigh = _Ulow
594
+ _Ulow >>>= 1
595
+ end
596
+ reduce_range! (ifq, ls, Ulow, _Uhigh)
597
+ ncomparison = if reductisvectorized
598
+ add_upper_comp_check (unrolledloop, Expr (:call , lv (:vmul ), VECTORWIDTHSYMBOL, Uhigh))
599
+ else
600
+ add_upper_comp_check (unrolledloop, Uhigh)
550
601
end
551
- ncomparison = Expr (:call , :! , comparison)
552
602
Expr (:if , ncomparison, ifq)
553
603
end
554
604
function reduce_expr! (q:: Expr , ls:: LoopSet , U:: Int )
@@ -760,7 +810,9 @@ function calc_Ureduct(ls::LoopSet, us::UnrollSpecification)
760
810
elseif u₂ == - 1
761
811
loopisstatic = isstaticloop (getloop (ls, names (ls)[u₁loopnum]))
762
812
loopisstatic &= ((vectorizedloopnum != u₁loopnum) | (! iszero (ls. vector_width[])))
763
- loopisstatic ? u₁ : min (u₁, 4 )
813
+ # loopisstatic ? u₁ : min(u₁, 4) # much worse than the other two options, don't use this one
814
+ loopisstatic ? u₁ : (u₁ ≥ 4 ? 2 : 1 )
815
+ # loopisstatic ? u₁ : 1
764
816
else
765
817
8 # u₂#u₁
766
818
# elseif num_loops(ls) == u₁loopnum
0 commit comments