It seems that the nested loop example should get the following implementation for Threads:
```julia
using Dagger, Random, Distributions, StatsBase, DataFrames

function f(dist, len, reps, σ)
    v = Vector{Float64}(undef, len) # reuse one buffer to avoid allocations
    maximum(mean(rand!(dist, v)) for _ in 1:reps) / σ
end

function experiments_threads(dists, lens, K=1000)
    res = DataFrame()
    @sync for T in dists
        dist = T()
        σ = Threads.@spawn std(dist) # σ is a Task; fetch it before use
        for L in lens
            z = Threads.@spawn f(dist, L, K, fetch(σ))
            push!(res, (; T, σ, L, z))
        end
    end
    res.z = fetch.(res.z)
    res.σ = fetch.(res.σ)
    res
end

function experiments_dagger(dists, lens, K=1000)
    res = DataFrame()
    @sync for T in dists
        dist = T()
        σ = Dagger.@spawn std(dist)
        for L in lens
            # σ is passed unfetched; Dagger resolves the dependency itself
            z = Dagger.@spawn f(dist, L, K, σ)
            push!(res, (; T, σ, L, z))
        end
    end
    res.z = fetch.(res.z)
    res.σ = fetch.(res.σ)
    res
end

dists = [Cosine, Epanechnikov, Laplace, Logistic, Normal, NormalCanon,
         PGeneralizedGaussian, SkewNormal, SkewedExponentialPower, SymTriangularDist]
lens = [10, 20, 50, 100, 200, 500]
```
Benchmark results, on 6 threads:

```julia
using BenchmarkTools
@btime experiments_dagger(dists, lens)  # slightly slower: 574.444 ms (9740771 allocations: 271.22 MiB)
@btime experiments_threads(dists, lens) # slightly faster: 543.696 ms (9681150 allocations: 268.68 MiB)
```

The difference in time might be pure randomness in this case.
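As an aside (my addition, not part of the timings above): BenchmarkTools recommends interpolating global variables with `$`, so a cleaner measurement would look like the sketch below. This should not change the comparison, since both versions were benchmarked the same way.

```julia
using BenchmarkTools
# Interpolating the globals with `$` keeps untyped-global access out of the
# measurement (standard BenchmarkTools practice).
@btime experiments_dagger($dists, $lens)
@btime experiments_threads($dists, $lens)
```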
However, even more confusingly: if I add additional processes up front (after a clean restart of Julia),

```julia
using Distributed
Distributed.addprocs(2, exeflags=`--threads=3`)
```

and then run the previous code, `@btime experiments_dagger(dists, lens)` is not twice as fast (we added another 6 threads in total), but stays at about the same speed.
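To narrow this down, one could check whether the scheduler actually places work on the added processes. The snippet below is a diagnostic sketch of my own (not from the report above): it spawns trivial tasks that return the id of the worker they ran on and tallies the results with `countmap` from StatsBase.

```julia
using Distributed, Dagger, StatsBase

# Diagnostic sketch: spawn trivial tasks and record which worker each ran on.
ids = [Dagger.@spawn myid() for _ in 1:50]
countmap(fetch.(ids)) # roughly even counts per worker id => work is distributed
```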