diff --git a/benchmark/benchmark_ondisk.jl b/benchmark/benchmark_ondisk.jl index 64220bc..cfde840 100644 --- a/benchmark/benchmark_ondisk.jl +++ b/benchmark/benchmark_ondisk.jl @@ -51,13 +51,13 @@ function psample_file_pop(data, rngs, n) push!(samples, s) push!(weights, Wtot) if length(samples) == 10 - samples = [combine(rngs[j], samples, weights),] + samples = [combine(rngs, samples, weights),] weights = [sum(weights),] end end end end - return combine(rngs[1], samples, weights) + return combine(rngs, samples, weights) end function sample_file_rs(data, rng, n, alg) @@ -97,7 +97,7 @@ function psample_file_st(data, rngs, n, alg) samples[i] = collect(StreamSampler{dtype}(rngs[i], @view(data[c]), wf, n, W, alg)) weights[i] = W end - return combine(rngs[1], samples, weights) + return combine(rngs, samples, weights) end filename = "random_data.arrow" @@ -117,10 +117,15 @@ precompile(sample_file_st, typeof.((data, rng, n, AlgORDWSWR()))) precompile(psample_file_st, typeof.((data, rngs, n, AlgORDWSWR()))) times = [] -for n in (totaltpl ÷ 100000, totaltpl ÷ 10000, totaltpl ÷ 1000) - t1 = @elapsed sample_file_pop(data, rng, n); - t2 = @elapsed psample_file_pop(data, rngs, n); - +for n in (totaltpl ÷ 100000, totaltpl ÷ 10000, totaltpl ÷ 1000, totaltpl ÷ 100) + + if n != totaltpl ÷ 100 + t1 = @elapsed sample_file_pop(data, rng, n); + t2 = @elapsed psample_file_pop(data, rngs, n); + else + t1 = nothing + t2 = nothing + end t3 = @elapsed sample_file_st(data, rng, n, AlgORDWSWR()); t4 = @elapsed psample_file_st(data, rngs, n, AlgORDWSWR()); @@ -133,9 +138,9 @@ times = hcat(times...) using CairoMakie -x = 1:3 -xtick_positions = [1,2,3] -xtick_labels = ["0.001%","0.01%","0.1%"] +x = 1:4 +xtick_positions = [1,2,3,4] +xtick_labels = ["0.001%","0.01%","0.1%","1%"] algonames = ["chunks", "chunks (4 threads)", "stream", "stream (4 threads)", "reservoir", "reservoir (4 threads)",] @@ -147,21 +152,21 @@ ax = Axis(fig[1, 1]; xlabel = "sample size", ylabel = "time (s)", xticks = (xtick_positions, xtick_labels), xgridstyle = :dot, ygridstyle = :dot, xticklabelsize = 10, yticklabelsize = 10, - xlabelsize = 12, ylabelsize = 12, + xlabelsize = 12, ylabelsize = 12 ) for i in 1:size(times, 1) - scatterlines!(ax, x, times[i, :]; + scatterlines!(ax, x, [x == nothing ? Inf : x for x in times[i, :]]; label = algonames[i], linestyle = (:dash, :dense), marker = markers[i], - markersize = 8, - linewidth = 2) + markersize = 13, + linewidth = 2,) end - +ylims!(low=0, high = 250) fig[2, 1] = Legend(fig, ax, framevisible = false, orientation = :horizontal, halign = :center, nbanks=2, fontsize=10) fig -save("comparison_ondisk_algs.pdf", fig) +save("comparison_ondisk_algs.svg", fig) diff --git a/docs/src/benchmark.md b/docs/src/benchmark.md index dd4a965..5bf6dd0 100644 --- a/docs/src/benchmark.md +++ b/docs/src/benchmark.md @@ -21,7 +21,7 @@ We also tried to evaluate the performance of the procedures on persistent data. performance of weighted sampling with replacement from 100 GB of data in the arrow format stored on disk: -![comparison_ondisk_algs](https://github.com/user-attachments/assets/622c5d03-07f2-428c-9bb5-6d6fcc629bec) +![comparison_ondisk_algs](https://github.com/user-attachments/assets/a6bc09a0-12c9-4a7b-9cc7-0e25edf35eba) the "chunks" method uses `StatsBase.sample` along with the merging methods of this package to sample subsequent chunks of the stored data and then recombine the samples. The other methods employ the