87
87
# miter decreases in each iteration of factors
88
88
miter, niter = factors[i]
89
89
90
- r = ( MoW % miter)
90
+ r = MoW % miter
91
91
# if ((miter * W * U * 2) ≤ M - (W+W)) & ((r == 0) | (miter == (r+1)))
92
92
mlarge = (miter * (U * 2 )) ≤ MoW - 2
93
93
# we want `mlarge` enough, or there to be no remainder (`r == 0`)
103
103
@inbounds factors[(length (factors)+ 1 )>>> 1 ]
104
104
end
105
105
106
- # struct ChooseNumBlocks{U,C} <: Function end
107
- # function (cnb::ChooseNumBlocks{U,C})(M::UInt) where {U,C}
108
- # choose_num_blocks(M, StaticInt{U}(), StaticInt{C}())
109
- # end
110
-
111
- # @generated function choose_num_block_table(::StaticInt{U}, ::StaticInt{NC}) where {U,NC}
112
- # t = Expr(:tuple)
113
- # for n ∈ 1:NC
114
- # cnb = :(ChooseNumBlocks{$U,$n}())
115
- # push!(t.args, :(@cfunction($cnb, Tuple{UInt,UInt}, (UInt,))))
116
- # end
117
- # t
118
- # end
119
106
@generated function choose_num_block_table (:: StaticInt{NC} ) where {NC}
120
107
t = Expr (:tuple )
121
108
for n ∈ 1 : NC
130
117
nt,
131
118
:: StaticInt{NTMAX} ,
132
119
) where {U,NTMAX}
133
- # valid range for nt: 2 ≤ nt ≤ NTMAX
134
- # if NTMAX > 8
135
- # return quote
136
- # $(Expr(:meta,:inline))
137
- # choose_num_blocks_table(M, StaticInt{$U}(), nt, StaticInt{$NTMAX}())
138
- # end
139
- # else
140
120
if NTMAX == 2 # `nt` must be `2`
141
121
return quote
142
122
$ (Expr (:meta , :inline ))
@@ -166,16 +146,6 @@ function add_bisecting_if_branches!(q, lb, ub, U, isfirst::Bool)
166
146
return
167
147
end
168
148
169
- # @inline function choose_num_blocks_table(M, ::StaticInt{U}, nt, ::StaticInt{NTMAX}) where {U,NTMAX}
170
- # if nt == NTMAX
171
- # choose_num_blocks(M % UInt, StaticInt{U}(), StaticInt{NTMAX}())
172
- # else
173
- # @inbounds fptr = choose_num_block_table(StaticInt{U}(), StaticInt{NTMAX}())[nt]
174
- # VectorizationBase.assume(fptr ≠ C_NULL)
175
- # ccall(fptr, Tuple{UInt,UInt}, (UInt,), M%UInt)
176
- # end
177
- # end
178
-
179
149
# if a threaded loop is vectorized, call
180
150
@inline function choose_num_blocks (M, :: StaticInt{U} , nt) where {U}
181
151
_choose_num_blocks (M % UInt, StaticInt {U} (), nt, lv_max_num_threads ())
184
154
@inline choose_num_blocks (nt, :: StaticInt{NC} = lv_max_num_threads ()) where {NC} =
185
155
@inbounds choose_num_block_table (StaticInt {NC} ())[nt]
186
156
187
-
188
-
189
- # The goal is to minimimize the maximum costs...
190
- # But maybe 'relatively even sizes' heuristics are more robust than fancy modeling?
191
- # At least early on, before lots of test cases with different sorts of loops have informed the modeling.
192
- #
193
- # goal is to produce `nblocks` roughly even block sizes (bM, bN), such that `bM % fM == bN % fN == 0`.
194
- # function roughly_even_blocks(M, N, fM, fN, nblocks)
195
- # M_N_ratio = M / N
196
- # block_per_m = sqrt(nblocks * M_N_ratio) # obv not even
197
- # blocks_per_n = block_per_m / M_N_ratio
198
- # mi = cld(M, fM)
199
- # ni = cld(N, fN)
200
- # block_per_m, blocks_per_n
201
- # end
202
157
if Sys. ARCH === :x86_64
203
158
@inline function choose_num_threads (
204
159
C:: T ,
@@ -280,7 +235,6 @@ function outer_reduct_combine_expressions(ls::LoopSet, retv)
280
235
# push!(q.args, :(@show var"#load#thread#ret#"))
281
236
for (i, or) ∈ enumerate (ls. outer_reductions)
282
237
op = ls. operations[or]
283
- var = name (op)
284
238
mvar = mangledvar (op)
285
239
instr = instruction (op)
286
240
out = Symbol (mvar, " ##onevec##" )
@@ -328,7 +282,6 @@ function thread_loop_summary!(
328
282
)
329
283
W = ls. vector_width
330
284
@unpack u₁loop, u₂loop, vloop, u₁, u₂max = ua
331
- u₂ = u₂max
332
285
threadloopnumtag = Int (issecondthreadloop)
333
286
lensym = Symbol (" #len#thread#$threadloopnumtag #" )
334
287
define_len = if isstaticloop (threadedloop)
@@ -505,7 +458,7 @@ function thread_one_loops_expr(
505
458
thread_loop_summary! (ls, ua, threadedloop, false )
506
459
loopboundexpr = Expr (:tuple ) # for launched threads
507
460
lastboundexpr = Expr (:tuple ) # remainder, started on main thread
508
- for (i, loop) ∈ enumerate ( ls. loops)
461
+ for loop ∈ ls. loops
509
462
if loop === threadedloop
510
463
push! (loopboundexpr. args, looprange)
511
464
push! (lastboundexpr. args, lastrange)
@@ -546,8 +499,6 @@ function thread_one_loops_expr(
546
499
if var"##do#thread##"
547
500
var"#threads#tuple#" , var"#torelease#tuple#" =
548
501
PolyesterWeave. request_threads (var"#nrequest#" )
549
- # var"#threads#tuple#", var"#torelease#tuple#" = PolyesterWeave.request_threads(Threads.threadid()%UInt32, var"#nrequest#")
550
-
551
502
var"#thread#factor#0#" = var"#nthreads#"
552
503
$ iterdef
553
504
var"#thread#id#" = 0x00000000
@@ -627,7 +578,6 @@ function thread_one_loops_expr(
627
578
end
628
579
$ retexpr
629
580
end
630
- # Expr(:block, Expr(:meta,:inline), ls.preamble, q)
631
581
Expr (:block , ls. preamble, q)
632
582
end
633
583
function define_vthread_blocks (vloop, u₁loop, u₂loop, u₁, u₂, ntmax, tn)
@@ -710,7 +660,6 @@ function thread_two_loops_expr(
710
660
end
711
661
@unpack u₁loop, u₂loop, vloop, u₁, u₂max = ua
712
662
u₂ = u₂max
713
- W = ls. vector_width
714
663
threadedloop1 = getloop (ls, threadedid1)
715
664
threadedloop2 = getloop (ls, threadedid2)
716
665
define_len1, define_num_unrolls1, loopstart1, iterstop1, looprange1, lastrange1 =
@@ -719,7 +668,7 @@ function thread_two_loops_expr(
719
668
thread_loop_summary! (ls, ua, threadedloop2, true )
720
669
loopboundexpr = Expr (:tuple )
721
670
lastboundexpr = Expr (:tuple )
722
- for (i, loop) ∈ enumerate ( ls. loops)
671
+ for loop ∈ ls. loops
723
672
if loop === threadedloop1
724
673
push! (loopboundexpr. args, looprange1)
725
674
push! (lastboundexpr. args, lastrange1)
@@ -741,11 +690,9 @@ function thread_two_loops_expr(
741
690
Val (typeof (var"#avx#call#args#" )),
742
691
flatten_to_tuple (var"#avx#call#args#" )... ,
743
692
))
744
- # _turbo_orig_ = :(_turbo_!(Val{$UNROLL}(), $OPS, $ARF, $AM, $LPSYM, var"#lv#tuple#args#"))
745
693
update_return_values = if length (ls. outer_reductions) > 0
746
694
retv = loopset_return_value (ls, Val (false ))
747
695
_turbo_call_ = Expr (:(= ), retv, _turbo_call_)
748
- # _turbo_orig_ = Expr(:(=), retv, _turbo_orig_)
749
696
outer_reduct_combine_expressions (ls, retv)
750
697
else
751
698
nothing
@@ -757,15 +704,10 @@ function thread_two_loops_expr(
757
704
retexpr = length (ls. outer_reductions) > 0 ? :(return $ retv) : :(return nothing )
758
705
q = quote
759
706
$ choose_nthread # UInt
760
- # @show var"#nthreads#"
761
707
$ loopstart1
762
708
$ loopstart2
763
709
var"##do#thread##" = var"#nthreads#" > one (var"#nthreads#" )
764
710
if var"##do#thread##"
765
- # if var"#nthreads#" ≤ 1
766
- # $_turbo_orig_
767
- # return $retexpr
768
- # end
769
711
$ define_len1
770
712
$ define_len2
771
713
$ define_num_unrolls1
@@ -913,7 +855,6 @@ function thread_two_loops_expr(
913
855
end
914
856
$ retexpr
915
857
end
916
- # Expr(:block, Expr(:meta,:inline), ls.preamble, q)
917
858
Expr (:block , ls. preamble, q)
918
859
end
919
860
@@ -957,8 +898,6 @@ function avx_threads_expr(
957
898
)
958
899
valid_thread_loop, ua, c = valid_thread_loops (ls)
959
900
num_candiates = sum (valid_thread_loop)
960
- # num_to_thread = min(num_candiates, 2)
961
- # candidate_ids =
962
901
if (num_candiates == 0 ) || (nt ≤ 1 ) # it was called from `avx_body` but now `nt` was set to `1`
963
902
avx_body (ls, UNROLL)
964
903
elseif (num_candiates == 1 ) || (nt ≤ 3 )
0 commit comments