|
159 | 159 | # ni = cld(N, fN)
|
160 | 160 | # block_per_m, blocks_per_n
|
161 | 161 | # end
|
162 |
| - |
163 |
# `choose_num_threads` rescales the cost `C` by a fixed factor and forwards the
# result, together with `NT` and `x`, to `_choose_num_threads`.
# The factor is picked once, when this code is loaded, from `Sys.ARCH`:
# every architecture other than x86_64 uses a quarter of the x86_64 factor.
if Sys.ARCH !== :x86_64
    @inline function choose_num_threads(C::Float64, NT::UInt, x::Base.BitInteger)
        scaled = Base.FastMath.mul_float_fast(C, 0.05460264079015985 * 0.25)
        return _choose_num_threads(scaled, NT, x)
    end
else
    @inline function choose_num_threads(C::Float64, NT::UInt, x::Base.BitInteger)
        scaled = Base.FastMath.mul_float_fast(C, 0.05460264079015985)
        return _choose_num_threads(scaled, NT, x)
    end
end
# Thread-count heuristic: `ceil(C * sqrt(x))`, capped at `NT`.
# `x` is converted to `Float64` with `Base.uitofp` (i.e. read as an unsigned
# integer), and the rounded-up product is converted to `UInt` with `Base.fptoui`.
@inline function _choose_num_threads(C::Float64, NT::UInt, x::Base.BitInteger)
    root = Base.sqrt_llvm(Base.uitofp(Float64, x))
    scaled = Base.FastMath.mul_float_fast(C, root)
    wanted = Base.fptoui(UInt, Base.ceil_llvm(scaled))
    return min(wanted, NT)
end
|
165 | 168 | function push_loop_length_expr!(q::Expr, ls::LoopSet)
|
166 | 169 | l = 1
|
@@ -326,6 +329,9 @@ function thread_one_loops_expr(
|
326 | 329 | )
|
327 | 330 | looplen = looplengthprod(ls)
|
328 | 331 | c = 0.05460264079015985 * c / looplen
|
| 332 | + if Sys.ARCH !== :x86_64 |
| 333 | + c *= 0.25 |
| 334 | + end |
329 | 335 | if all(isstaticloop, ls.loops)
|
330 | 336 | _num_threads = _choose_num_threads(c, ntmax, Int64(looplen))::UInt
|
331 | 337 | _num_threads > 1 || return avx_body(ls, UNROLL)
|
@@ -450,6 +456,9 @@ function thread_two_loops_expr(
|
450 | 456 | )
|
451 | 457 | looplen = looplengthprod(ls)
|
452 | 458 | c = 0.05460264079015985 * c / looplen
|
| 459 | + if Sys.ARCH !== :x86_64 |
| 460 | + c *= 0.25 |
| 461 | + end |
453 | 462 | if all(isstaticloop, ls.loops)
|
454 | 463 | _num_threads = _choose_num_threads(c, ntmax, Int64(looplen))::UInt
|
455 | 464 | _num_threads > 1 || return avx_body(ls, UNROLL)
|
@@ -521,6 +530,7 @@ function thread_two_loops_expr(
|
521 | 530 | $loopstart1
|
522 | 531 | var"#loop#1#start#init#" = var"#iter#start#0#"
|
523 | 532 | $loopstart2
|
| 533 | + # @show var"#nrequest#" |
524 | 534 | var"##do#thread##" = var"#nrequest#" ≠ 0x00000000
|
525 | 535 | if var"##do#thread##"
|
526 | 536 | var"#threads#", var"#torelease#" = CheapThreads.request_threads(Threads.threadid(), var"#nrequest#")
|
|
0 commit comments