diff --git a/docs/src/literate/falsesharing/falsesharing.jl b/docs/src/literate/falsesharing/falsesharing.jl
index 13caa44..22521c0 100644
--- a/docs/src/literate/falsesharing/falsesharing.jl
+++ b/docs/src/literate/falsesharing/falsesharing.jl
@@ -30,14 +30,14 @@ data = rand(1_000_000 * nthreads());
 #
 # A common, manual implementation of this idea might look like this:
 
-using OhMyThreads: @spawn, index_chunks
+using OhMyThreads: @spawn, chunks
 
 function parallel_sum_falsesharing(data; nchunks = nthreads())
     psums = zeros(eltype(data), nchunks)
-    @sync for (c, idcs) in enumerate(index_chunks(data; n = nchunks))
+    @sync for (i, chunk) in enumerate(chunks(data; n = nchunks))
         @spawn begin
-            for i in idcs
-                psums[c] += data[i]
+            for x in chunk
+                psums[i] += x
             end
         end
     end
@@ -102,13 +102,13 @@ nthreads()
 function parallel_sum_tasklocal(data; nchunks = nthreads())
     psums = zeros(eltype(data), nchunks)
-    @sync for (c, idcs) in enumerate(index_chunks(data; n = nchunks))
+    @sync for (i, chunk) in enumerate(chunks(data; n = nchunks))
         @spawn begin
             local s = zero(eltype(data))
-            for i in idcs
-                s += data[i]
+            for x in chunk
+                s += x
             end
-            psums[c] = s
+            psums[i] = s
         end
     end
     return sum(psums)
 end
@@ -131,8 +131,8 @@ end
 # using `map` and reusing the built-in (sequential) `sum` function on each parallel task:
 
 function parallel_sum_map(data; nchunks = nthreads())
-    ts = map(index_chunks(data, n = nchunks)) do idcs
-        @spawn @views sum(data[idcs])
+    ts = map(chunks(data, n = nchunks)) do chunk
+        @spawn sum(chunk)
     end
     return sum(fetch.(ts))
 end
@@ -141,7 +141,7 @@ end
 @btime parallel_sum_map($data);
 
 # This implementation is conceptually
-# clearer in that there is no explicit modification of shared state, i.e. no `pums[c] = s`,
+# clearer in that there is no explicit modification of shared state, i.e. no `psums[i] = s`,
 # anywhere at all. We can't run into false sharing if we don't modify shared state 😉.
 #
 # Note that since we use the built-in `sum` function, which is highly optimized, we might see
diff --git a/docs/src/literate/mc/mc.jl b/docs/src/literate/mc/mc.jl
index 4ef3381..ae93bbe 100644
--- a/docs/src/literate/mc/mc.jl
+++ b/docs/src/literate/mc/mc.jl
@@ -79,16 +79,16 @@ using OhMyThreads: StaticScheduler
 # ## Manual parallelization
 #
-# First, using the `index_chunks` function, we divide the iteration interval `1:N` into
+# First, using the `chunks` function, we divide the iteration interval `1:N` into
 # `nthreads()` parts. Then, we apply a regular (sequential) `map` to spawn a Julia task
 # per chunk. Each task will locally and independently perform a sequential Monte Carlo
 # simulation. Finally, we fetch the results and compute the average estimate for $\pi$.
 
-using OhMyThreads: @spawn, index_chunks
+using OhMyThreads: @spawn, chunks
 
 function mc_parallel_manual(N; nchunks = nthreads())
-    tasks = map(index_chunks(1:N; n = nchunks)) do idcs
-        @spawn mc(length(idcs))
+    tasks = map(chunks(1:N; n = nchunks)) do chunk
+        @spawn mc(length(chunk))
     end
     pi = sum(fetch, tasks) / nchunks
     return pi
 end
@@ -101,13 +101,13 @@ mc_parallel_manual(N)
 @btime mc_parallel_manual($N) samples=10 evals=3;
 
 # It is faster than `mc_parallel` above because the task-local computation
-# `mc(length(idcs))` is faster than the implicit task-local computation within
+# `mc(length(chunk))` is faster than the implicit task-local computation within
 # `tmapreduce` (which itself is a `mapreduce`).
 
-idcs = first(index_chunks(1:N; n = nthreads()))
+chunk = first(chunks(1:N; n = nthreads()))
 
-@btime mapreduce($+, $idcs) do i
+@btime mapreduce($+, $chunk) do i
     rand()^2 + rand()^2 < 1.0
 end samples=10 evals=3;
 
-@btime mc($(length(idcs))) samples=10 evals=3;
+@btime mc($(length(chunk))) samples=10 evals=3;
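Note: the switch from `index_chunks` to `chunks` is not a pure rename. The patched code iterates chunk elements directly (`for x in chunk`, `sum(chunk)`, `length(chunk)`) instead of indexing into `data`. The sketch below illustrates the distinction the new code relies on; it assumes an OhMyThreads version that provides both functions (as the pre- and post-patch examples respectively import them), with `index_chunks` yielding ranges of indices and `chunks` yielding views of the collection. The variable `x` and the concrete chunk boundaries in the comments are only illustrative.

# Compare what the two chunking APIs yield for a small vector.
using OhMyThreads: chunks, index_chunks

x = collect(1.0:6.0)

collect(index_chunks(x; n = 2))  # ranges of indices into x, e.g. [1:3, 4:6]
collect(chunks(x; n = 2))        # views of x itself, e.g. [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]

This is why the map-based variant can call `sum(chunk)` directly where the old code needed `@views sum(data[idcs])`.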