|
162 | 162 | # block_per_m, blocks_per_n
|
163 | 163 | # end
|
164 | 164 |
|
165 |
| -@inline function choose_num_threads(::Val{C}, ::Val{NT}, x) where {C,NT} |
166 |
| - fx = Base.uitofp(Float64, x) |
167 |
| - min(Base.fptoui(UInt, Base.ceil_llvm(0.05460264079015985*C*Base.sqrt_llvm(fx))), NT) |
168 |
| -end |
| 165 | +@inline choose_num_threads(C::Float64, NT::UInt, x::Base.BitInteger) = _choose_num_threads(Base.FastMath.mul_float_fast(C, 0.05460264079015985), NT, x) |
| 166 | +@inline _choose_num_threads(C::Float64, NT::UInt, x::Base.BitInteger) = min(Base.fptoui(UInt, Base.ceil_llvm(Base.FastMath.mul_float_fast(C, Base.sqrt_llvm(Base.uitofp(Float64, x))))), NT) |
169 | 167 | function push_loop_length_expr!(q::Expr, ls::LoopSet)
|
170 | 168 | l = 1
|
171 | 169 | ndynamic = 0
|
@@ -328,12 +326,14 @@ function thread_one_loops_expr(
|
328 | 326 | ls::LoopSet, ua::UnrollArgs, valid_thread_loop::Vector{Bool}, ntmax::UInt, c::Float64,
|
329 | 327 | UNROLL::Tuple{Bool,Int8,Int8,Int,Int,Int,Int,Int,Int,Int,UInt}, OPS::Expr, ARF::Expr, AM::Expr, LPSYM::Expr
|
330 | 328 | )
|
| 329 | + looplen = looplengthprod(ls) |
| 330 | + c = 0.05460264079015985 * c / looplen |
331 | 331 | if all(isstaticloop, ls.loops)
|
332 |
| - _num_threads = choose_num_threads(Val(c), Val(ntmax), 1)::UInt |
| 332 | + _num_threads = _choose_num_threads(c, ntmax, Int64(looplen))::UInt |
333 | 333 | _num_threads > 1 || return avx_body(ls, UNROLL)
|
334 | 334 | choose_nthread = Expr(:(=), Symbol("#nthreads#"), _num_threads)
|
335 | 335 | else
|
336 |
| - choose_nthread = :(choose_num_threads(Val{$(c/looplengthprod(ls))}(), Val{$ntmax}())) |
| 336 | + choose_nthread = :(_choose_num_threads($c, $ntmax)) |
337 | 337 | push_loop_length_expr!(choose_nthread, ls)
|
338 | 338 | choose_nthread = Expr(:(=), Symbol("#nthreads#"), choose_nthread)
|
339 | 339 | end
|
@@ -444,12 +444,14 @@ function thread_two_loops_expr(
|
444 | 444 | ls::LoopSet, ua::UnrollArgs, valid_thread_loop::Vector{Bool}, ntmax::UInt, c::Float64,
|
445 | 445 | UNROLL::Tuple{Bool,Int8,Int8,Int,Int,Int,Int,Int,Int,Int,UInt}, OPS::Expr, ARF::Expr, AM::Expr, LPSYM::Expr
|
446 | 446 | )
|
| 447 | + looplen = looplengthprod(ls) |
| 448 | + c = 0.05460264079015985 * c / looplen |
447 | 449 | if all(isstaticloop, ls.loops)
|
448 |
| - _num_threads = choose_num_threads(Val(c), Val(ntmax), 1)::UInt |
| 450 | + _num_threads = _choose_num_threads(c, ntmax, Int64(looplen))::UInt |
449 | 451 | _num_threads > 1 || return avx_body(ls, UNROLL)
|
450 | 452 | choose_nthread = Expr(:(=), Symbol("#nthreads#"), _num_threads)
|
451 | 453 | else
|
452 |
| - choose_nthread = :(choose_num_threads(Val{$(c/looplengthprod(ls))}(), Val{$ntmax}())) |
| 454 | + choose_nthread = :(_choose_num_threads($c, $ntmax)) |
453 | 455 | push_loop_length_expr!(choose_nthread, ls)
|
454 | 456 | choose_nthread = Expr(:(=), Symbol("#nthreads#"), choose_nthread)
|
455 | 457 | end
|
|
0 commit comments