@@ -246,11 +246,6 @@ function thread_loop_summary!(ls::LoopSet, ua::UnrollArgs, threadedloop::Loop, i
246
246
:($ lensym = $ ((threadedloop. lensym)) % UInt)
247
247
end
248
248
unroll_factor = Core. ifelse (threadedloop === vloop, W, 1 )
249
- # if threadedloop === u₁loop
250
- # unroll_factor *= u₁
251
- # elseif threadedloop === u₂loop
252
- # unroll_factor *= u₂
253
- # end
254
249
num_unroll_sym = Symbol (" #num#unrolls#thread#$threadloopnumtag #" )
255
250
define_num_unrolls = if unroll_factor == 1
256
251
:($ num_unroll_sym = $ lensym)
@@ -334,15 +329,19 @@ function define_block_size(threadedloop, vloop, tn, W)
334
329
end
335
330
end
336
331
end
332
"""
    scale_cost(c, looplen)

Return the cost estimate `c` multiplied by `0.05` and divided by `looplen`.

When `Sys.ARCH` is anything other than `:x86_64`, the result is additionally
multiplied by `0.25`.
"""
function scale_cost(c, looplen)
    percost = 0.05 * c / looplen
    # Only non-x86_64 targets receive the extra 0.25 factor.
    return Sys.ARCH === :x86_64 ? percost : percost * 0.25
end
337
339
function thread_one_loops_expr (
338
340
ls:: LoopSet , ua:: UnrollArgs , valid_thread_loop:: Vector{Bool} , ntmax:: UInt , c:: Float64 ,
339
341
UNROLL:: Tuple{Bool,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt} , OPS:: Expr , ARF:: Expr , AM:: Expr , LPSYM:: Expr
340
342
)
341
343
looplen = looplengthprod (ls)
342
- c = 0.0225 * c / looplen
343
- if Sys. ARCH != = :x86_64
344
- c *= 0.25
345
- end
344
+ c = scale_cost (c, looplen)
346
345
if all (isstaticloop, ls. loops)
347
346
_num_threads = _choose_num_threads (c, ntmax, Int64 (looplen)):: UInt
348
347
_num_threads > 1 || return avx_body (ls, UNROLL)
@@ -376,7 +375,6 @@ function thread_one_loops_expr(
376
375
nothing
377
376
end
378
377
retexpr = length (ls. outer_reductions) > 0 ? :(return $ retv) : :(return nothing )
379
- # @unpack u₁loop, u₂loop, vloop, u₁, u₂max = ua
380
378
iterdef = define_block_size (threadedloop, ua. vloop, 0 , ls. vector_width)
381
379
q = quote
382
380
$ choose_nthread # UInt
@@ -479,10 +477,8 @@ function thread_two_loops_expr(
479
477
UNROLL:: Tuple{Bool,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt} , OPS:: Expr , ARF:: Expr , AM:: Expr , LPSYM:: Expr
480
478
)
481
479
looplen = looplengthprod (ls)
482
- c = 0.0225 * c / looplen
483
- if Sys. ARCH != = :x86_64
484
- c *= 0.25
485
- end
480
+ # c = 0.0225 * c / looplen
481
+ c = scale_cost (c, looplen)
486
482
if all (isstaticloop, ls. loops)
487
483
_num_threads = _choose_num_threads (c, ntmax, Int64 (looplen)):: UInt
488
484
_num_threads > 1 || return avx_body (ls, UNROLL)
0 commit comments