@@ -292,19 +292,19 @@ function lower_unrolled_dynamic(ls::LoopSet, us::UnrollSpecification, n::Int, in
292
292
end
293
293
end
294
294
q = if unsigned (Ureduct) < unsigned (UF) # unsigned(-1) == typemax(UInt);
295
- add_cleanup = true
295
+ add_cleanup = ! loopisstatic # true
296
296
if isone (Ureduct)
297
297
UF_cleanup = 1
298
298
if nisvectorized
299
299
blockhead = :while
300
300
else
301
301
blockhead = if UF == 2
302
- if loopisstatic
303
- add_cleanup = UFt == 1
304
- :block
305
- else
306
- :if
307
- end
302
+ if loopisstatic
303
+ # add_cleanup = UFt == 1
304
+ :block
305
+ else
306
+ :if
307
+ end
308
308
else
309
309
:while
310
310
end
@@ -319,6 +319,12 @@ function lower_unrolled_dynamic(ls::LoopSet, us::UnrollSpecification, n::Int, in
319
319
end
320
320
_q = if dynamicbounded
321
321
initialize_outer_reductions! (q, ls, Ureduct); q
322
+ elseif loopisstatic
323
+ if length (loop) < UF* W
324
+ Expr (:block )
325
+ else
326
+ Expr (:block , add_upper_outer_reductions (ls, q, Ureduct, UF, loop, nisvectorized))
327
+ end
322
328
else
323
329
Expr (:block , add_upper_outer_reductions (ls, q, Ureduct, UF, loop, nisvectorized))
324
330
end
@@ -587,6 +593,12 @@ function add_upper_outer_reductions(ls::LoopSet, loopq::Expr, Ulow::Int, Uhigh::
587
593
end
588
594
push! (ifq. args, t)
589
595
ifqfull = Expr (:let , ifqlet, ifq)
596
+ if isstaticloop (unrolledloop)
597
+ W = Core. ifelse (reductisvectorized, ls. vector_width, 1 )
598
+ if Uhigh* W* gethint (step (unrolledloop)) ≤ length (unrolledloop)
599
+ return Expr (:(= ), mvartl, ifqfull)
600
+ end
601
+ end
590
602
ncomparison = if reductisvectorized
591
603
add_upper_comp_check (unrolledloop, mulexpr (VECTORWIDTHSYMBOL, Uhigh, step (unrolledloop)))
592
604
elseif isknown (step (unrolledloop))
@@ -848,44 +860,49 @@ end
848
860
# cld(u₂, cld(u₂, unroll))
849
861
# end
850
862
"""
    calc_Ureduct!(ls::LoopSet, us::UnrollSpecification)

Choose the unroll factor to use for the outer reductions of `ls`, store it in
`ls.ureduct`, and return it.

The value is selected as follows:

- `-1` when `ls.outer_reductions` is empty.
- When `us.u₂ == -1`:
    - `-1` unless `us.u₁loopnum == num_loops(ls)`.
    - If the u₁ loop is static (per `isstaticloop`) and, when it is also the
      vectorized loop, `ls.vector_width` is nonzero: the number of `W`-wide steps
      covering the remainder `length(u₁loop) % (W * u₁)`, or `u₁` when that
      remainder is zero. `W` is `ls.vector_width` if the u₁ loop is the vectorized
      loop and `1` otherwise.
    - Otherwise a fixed heuristic: `1` when `Sys.CPU_NAME === "znver1"`, else `2`
      if `u₁ ≥ 4` and `1` if not.
- Otherwise `u₁` if `isunrolled_sym` reports the outer reductions as u₁-unrolled,
  and `u₂` if not.

# Throws
- `ArgumentError` if the outer reductions do not all agree on their u₁-unrolled
  flag.
"""
function calc_Ureduct!(ls::LoopSet, us::UnrollSpecification)
    @unpack u₁loopnum, u₁, u₂, vloopnum = us
    ur = if iszero(length(ls.outer_reductions))
        -1
    elseif u₂ == -1
        if u₁loopnum == num_loops(ls)
            u₁loop = getloop(ls, u₁loopnum)
            loopisstatic = isstaticloop(u₁loop)
            # When the u₁ loop is also the vectorized loop, the static path needs a
            # nonzero vector width (it is used as a divisor below).
            loopisstatic &= ((vloopnum != u₁loopnum) | (!iszero(ls.vector_width)))
            # loopisstatic ? u₁ : min(u₁, 4) # much worse than the other two options, don't use this one
            if loopisstatic
                W = Core.ifelse(vloopnum == u₁loopnum, ls.vector_width, 1)
                # Iterations left over after the full `W * u₁` blocks, counted in
                # steps of `W`; fall back to the full factor when nothing is left over.
                UFt = cld(length(u₁loop) % (W * u₁), W)
                Core.ifelse(UFt == 0, u₁, UFt)
            else
                Core.ifelse(Sys.CPU_NAME === "znver1", 1, Core.ifelse(u₁ ≥ 4, 2, 1))
            end
        else
            -1
        end
    else
        # `-1` marks "no outer reduction inspected yet".
        u₁ui = u₂ui = -1
        u₁loopsym = getloop(ls, u₁loopnum).itersymbol
        u₂loopsym = getloop(ls, us.u₂loopnum).itersymbol
        vloopsym = getloop(ls, vloopnum).itersymbol
        for or ∈ ls.outer_reductions
            op = ls.operations[or]
            u₁u, u₂u = isunrolled_sym(op, u₁loopsym, u₂loopsym, vloopsym, us)
            # NOTE(review): `u₂u` is never used; `u₂ui` is set from and compared
            # against `u₁u`. This may be a copy-paste slip, but it is kept as-is
            # because checking `u₂u` would change which loops throw -- confirm intent.
            if u₁ui == -1
                u₁ui = Int(u₁u)
                u₂ui = Int(u₁u)
            elseif !((u₁ui == Int(u₁u)) & (u₂ui == Int(u₁u)))
                throw(ArgumentError("Doesn't currently handle differently unrolled reductions yet, please file an issue with an example."))
            end
        end
        if u₁ui % Bool
            u₁
        else
            u₂
        end
    end
    # The assignment is the last expression, so `ur` is also the return value.
    ls.ureduct = ur
end
890
907
"""
    ureduct(ls::LoopSet)

Return the value stored in the `ureduct` field of `ls`.
"""
function ureduct(ls::LoopSet)
    return ls.ureduct
end
891
908
function lower_unrollspec (ls:: LoopSet )
0 commit comments