Update benchmark code

ameligrana · web-flow · commit a4e7efcea14a · 2025-08-31T01:53:18.000+02:00
diff --git a/benchmark/benchmark_comparison_stream.jl b/benchmark/benchmark_comparison_stream.jl
@@ -1,82 +1,153 @@
+
 using StreamSampling, StatsBase
 using Random, Printf, BenchmarkTools
-using CairoMakie
+
+# Baseline: materialize `stream`, draw `n` elements with `sample`, and sum the draw.
+function samplesum(rng, stream, n, replace)
+    return sum(sample(rng, collect(stream), n; replace=replace))
+end
+# Weighted baseline: materialize `stream`, weight each element by `wf`, sample, sum.
+function samplesum(rng, stream, wf, n, replace)
+    items = collect(stream)
+    return sum(sample(rng, items, Weights(wf.(items)), n; replace=replace))
+end
+
+function rsvsamplesum(rng, stream, wf, n, alg)  # sum of a size-`n` reservoir sample
+    sampler = ReservoirSampler{Int}(rng, n, alg; mutable=false)
+    if alg ∉ (AlgL(), AlgRSWRSKIP())
+        for el in stream  # every other algorithm is fed a weight per element
+            sampler = fit!(sampler, el, wf(el))
+        end
+    else
+        for el in stream  # AlgL / AlgRSWRSKIP are fed elements only; `wf` is unused
+            sampler = fit!(sampler, el)
+        end
+    end
+    return sum(value(sampler))
+end
+
+function strsamplesum(rng, stream, wf, n, alg, W=nothing)  # sum of a StreamSampler draw
+    W === nothing && (W = sum(wf(x) for x in stream))  # no total given: one extra pass
+    st = if alg in (AlgD(), AlgORDSWR())
+        StreamSampler{Int}(rng, stream, n, W, alg)  # built without a weight function
+    else
+        StreamSampler{Int}(rng, stream, wf, n, W, alg)  # weights come from the `wf` argument
+    end
+    return sum(st)
+end
 
 rng = Xoshiro(42);
-stream = Iterators.filter(x -> x != 1, 1:10^8);
-pop = collect(stream);
-w(el) = Float64(el);
-weights = Weights(w.(stream));
-
-algs = (AlgL(), AlgRSWRSKIP(), AlgAExpJ(), AlgWRSWRSKIP());
-algsweighted = (AlgAExpJ(), AlgWRSWRSKIP());
-algsreplace = (AlgRSWRSKIP(), AlgWRSWRSKIP());
+stream = Iterators.filter(x -> x != 0, 1:10^8);  # lazy; no element of 1:10^8 is 0, so all pass
+W = 10^8  # total weight of `stream`: 10^8 elements, each weighted one by `w` / `w2`
+w(el) = 1.0;  # unit weight as a Float64
+w2(el) = 1;  # unit weight as an Int
+
+const algrsv = (AlgL(), AlgRSWRSKIP(), AlgAExpJ(), AlgWRSWRSKIP())  # position = result column
+const algstr = (AlgD(), AlgORDSWR(), nothing, AlgORDWSWR())  # `nothing`: column the stream loop skips
 sizes = (10^4, 10^5, 10^6, 10^7)
 
-p = Dict((0, 0) => 1, (0, 1) => 2, (1, 0) => 3, (1, 1) => 4);
-m_times = Matrix{Vector{Float64}}(undef, (3, 4));
+m_times = Matrix{Vector{Float64}}(undef, (4, 4));  # [sampler row, case column] -> `median(times) * 1e-6` per size
 for i in eachindex(m_times) m_times[i] = Float64[] end
-m_mems = Matrix{Vector{Float64}}(undef, (3, 4));
+m_mems = Matrix{Vector{Float64}}(undef, (4, 4));  # same layout, holding `memory * 1e-6` per size
 for i in eachindex(m_mems) m_mems[i] = Float64[] end
 
-for m in algs
-    for size in sizes
-        replace = m in algsreplace
-        weighted = m in algsweighted
-        if weighted
-            b1 = @benchmark itsample($rng, $stream, $w, $size, $m) seconds=20
-            b2 = @benchmark sample($rng, collect($stream), Weights($w.($stream)), $size; replace = $replace) seconds=20
-            b3 = @benchmark sample($rng, $pop, $weights, $size; replace = $replace) seconds=20
-        else
-            b1 = @benchmark itsample($rng, $stream, $size, $m) evals=1 seconds=20
-            b2 = @benchmark sample($rng, collect($stream), $size; replace = $replace) seconds=20
-            b3 = @benchmark sample($rng, $pop, $size; replace = $replace) seconds=20
+for size in sizes
+    i = 0  # result column; advanced once per (weighted, replace) combination
+    for weighted in (false, true)
+        for replace in (false, true)
+            if weighted
+                b1 = @benchmark samplesum($rng, $stream, $w, $size, $replace) seconds=20
+            else
+                b1 = @benchmark samplesum($rng, $stream, $size, $replace) seconds=20
+            end
+            i += 1
+            push!(m_times[1, i], median(b1.times) * 1e-6)  # row 1: collect-then-sample baseline
+            push!(m_mems[1, i], b1.memory * 1e-6)
         end
-        ts = [median(b1.times), median(b2.times), median(b3.times)] .* 1e-6
-        ms = [b1.memory, b2.memory, b3.memory] .* 1e-6
-        c = p[(weighted, replace)]
-        for r in 1:3
-            push!(m_times[r, c], ts[r])
-            push!(m_mems[r, c], ms[r])
+    end
+end
+# Reservoir samplers fill row 2; the position of `alg` in `algrsv` is its column.
+for n in sizes
+    for (col, alg) in enumerate(algrsv)
+        # `w` is passed for every algorithm, including those that are fed no weights.
+        b2 = @benchmark rsvsamplesum($rng, $stream, $w, $n, $alg) seconds=20
+        push!(m_times[2, col], median(b2.times) * 1e-6)
+        push!(m_mems[2, col], b2.memory * 1e-6)
+    end
+end
+for n in sizes
+    for (i, alg) in enumerate(algstr)
+        # `algstr` has an empty slot; its column index is still consumed, then skipped.
+        alg === nothing && continue
+        if alg in (AlgD(), AlgORDSWR())
+            # b3 omits the total weight (the helper derives it); b4 supplies it up front.
+            b3 = @benchmark strsamplesum($rng, $stream, $w2, $n, $alg) seconds=20
+            b4 = @benchmark strsamplesum($rng, $stream, $w2, $n, $alg, $W) seconds=20
+        else
+            b3 = @benchmark strsamplesum($rng, $stream, $w, $n, $alg) seconds=20
+            b4 = @benchmark strsamplesum($rng, $stream, $w, $n, $alg, $(Float64(W))) seconds=20
         end
-        println("c")
+        push!(m_times[3, i], median(b3.times) * 1e-6)
+        push!(m_mems[3, i], b3.memory * 1e-6)
+        push!(m_times[4, i], median(b4.times) * 1e-6)
+        push!(m_mems[4, i], b4.memory * 1e-6)
     end
 end
 
+using CairoMakie
+
 f = Figure(fontsize = 9,);
-axs = [Axis(f[i, j], yscale = log10, xscale = log10) for i in 1:4 for j in 1:2];
+axs = [Axis(f[i, j], yscale = log10, xscale = log10, xgridstyle = :dot,
+          ygridstyle = :dot) for i in 1:4 for j in 1:2];  # axs[k] walks the 4x2 grid row by row
 
-labels = (
-    "stream-based\n(StreamSampling.itsample)", 
-    "collection-based with setup\n(StatsBase.sample)", 
-    "collection-based\n(StatsBase.sample)"
-)
+labels = ("population", "reservoir", "stream", "stream - one pass" )  # one per row of m_times / m_mems
 
-markers = (:circle, :rect, :utriangle)
+markers = (:circle, :rect, :utriangle, :xcross)  # indexed like `labels`, one per result row
 a, b = 0, 0
 
 for j in 1:8
     m = j in (3, 4, 7, 8) ? m_mems : m_times
     m == m_mems ? (a += 1) : (b += 1)
     s = m == m_mems ? a : b
-    for i in 1:3 
-        scatterlines!(axs[j], [0.01, 0.1, 1, 10], m[i, s]; label = labels[i], marker = markers[i])
+    for i in 1:4
+        length(m[i, s]) != 4 && continue  # skip rows without one value per sample size
+        # Plot the stored vector directly: no copy, and `t` stays free for the title suffix.
+        scatterlines!(axs[j], [0.01, 0.1, 1, 10], m[i, s]; label = labels[i], marker = markers[i], linestyle=(:dash, :dense))
+    end
+    if j in (1,3,5,7)  # odd panels only, i.e. the first grid column
+        axs[j].ylabel = m == m_mems ? "memory (Mb)" : "time (ms)"
+    end
-    axs[j].ylabel = m == m_mems ? "memory (Mb)" : "time (ms)"
     axs[j].xtickformat = x -> string.(x) .* "%"
-    j in (3, 4, 7, 8) && (axs[j].xlabel = "sample size")
+    j in (7, 8) && (axs[j].xlabel = "sample size")  # last grid row only
     pr = j in (1, 2) ? "un" : ""
     t = j in (1, 5) ? "out" : "" 
     j in (1, 2, 5, 6) && (axs[j].title = pr * "weighted with" * t * " replacement")
     axs[j].titlegap = 8.0
     j in (1, 2, 5, 6) && hidexdecorations!(axs[j], grid = false)
 end
 
+# Every panel gets the same log-scale tick locator.
+foreach(ax -> (ax.yticks = LogTicks(WilkinsonTicks(4, k_min=4, k_max=6))), axs)
+
+# Panels 1, 2, 5, 6 plot times and panels 3, 4, 7, 8 plot memory:
+# link the y ranges within each group.
+linkyaxes!(axs[[1, 2, 5, 6]]...)
+linkyaxes!(axs[[3, 4, 7, 8]]...)
+
+# Even-numbered panels sit in the second grid column; drop their y tick labels.
+foreach(ax -> (ax.yticklabelsvisible = false), axs[2:2:8])
+
+# Panels 3 and 4 (the upper memory row) drop their x tick labels.
+foreach(ax -> (ax.xticklabelsvisible = false), axs[3:4])
+
+
 f[5, 1] = Legend(f, axs[1], framevisible = false, orientation = :horizontal, 
         halign = :center, padding=(248,0,0,0))
 
-Label(f[0, :], "Comparison between stream-based and collection-based algorithms", fontsize = 13,
+Label(f[0, :], "Performance of Sampling Algorithms on Iterators", fontsize = 13,  # figure title at layout row 0
     font=:bold)
 
-save("comparison_stream_algs.png", f)
 f
+
+save("comparison_stream_algs.pdf", f)  # write the finished figure `f` to this path
+