Skip to content

Commit 088258b

Browse files
committed
Push test script
1 parent 8f3c6bd commit 088258b

File tree

1 file changed

+177
-0
lines changed

1 file changed

+177
-0
lines changed

dev/flopscomp.jl

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
# using Pkg
2+
# Pkg.activate(temp=true)
3+
# Pkg.add(url="https://github.com/christiangnrd/Metal.jl/", rev="MPSGraph")
4+
# Pkg.add(["GPUArrays", "Plots"])
5+
6+
# Uncomment if you want to compare with CPU
7+
# Pkg.add(["AppleAccelerate"])
8+
# using AppleAccelerate
9+
10+
using Metal, GPUArrays, LinearAlgebra, Printf
11+
12+
using Plots
13+
using Plots.Measures
14+
15+
# Fallback GPU-core count for the plot title if the system_profiler lookup
# below is removed.
n_gpu_cores = "??"
# Comment this out if scary. Please mention number of cores in your comment when uploading the figure
system_prof = read(`system_profiler SPDisplaysDataType`, String)
# NOTE(review): `only` assumes exactly one "Total Number of Cores" entry in the
# profiler output — confirm behavior on multi-GPU Macs.
n_gpu_cores = only(match(r"Total Number of Cores:\s*(\d+)", system_prof).captures)

# Used as the default figure title in `main`.
PLOT_TITLE = "Matmul peakflops for $(device().name) ($n_gpu_cores GPU cores)"
21+
22+
"""
    cpupeakflops(; n=4096, n_batch=1, inT=Float32, outT=inT, ntrials=4, verify=true)

Measure CPU matrix-multiply throughput via in-place `mul!` on `n`×`n`
matrices of ones, returning `2n^3` divided by the fastest of `ntrials`
runs (flops). `n_batch` exists for interface parity with the GPU
benchmarks; only 1 is supported. With `verify=true`, every result entry
is checked against the expected value `n`.
"""
function cpupeakflops(; n::Integer=4096,
                        n_batch::Integer=1,
                        inT::DataType=Float32,
                        outT::DataType=inT,
                        ntrials::Integer=4,
                        verify=true)
    t = Base.zeros(Float64, ntrials)
    n_batch == 1 || @warn "n_batch > 1 not supported for `mul!`, running with n_batch=1"
    n_batch = 1
    shape = (n, n)
    for i in 1:ntrials
        # Fresh matrices each trial so no run benefits from warmed caches
        # holding the previous result.
        c = zeros(outT, shape...)
        a = ones(inT, shape...)
        b = ones(inT, shape...)
        t[i] = @elapsed mul!(c, a, b)
        # ones * ones makes every entry equal n; check in place instead of
        # materializing the `Array(c)` copy + `unique` set the original
        # version allocated.
        verify && @assert all(==(n), c)
    end

    # 2n^3 flops per matmul; report the best (minimum-time) trial.
    return n_batch*2*Float64(n)^3 / minimum(t)
end
42+
"""
    _peakflops(f, n, n_batch, inT, outT, ntrials; verify=true)

Shared GPU benchmark driver: time `ntrials` calls of `f(c, a, b)` on
Metal arrays of ones (batched when `n_batch > 1`) and return
`n_batch * 2n^3` divided by the fastest run.
"""
function _peakflops(f, n, n_batch, inT, outT, ntrials; verify=true)
    dims = n_batch == 1 ? (n, n) : (n, n, n_batch)
    best = Inf
    for _ in 1:ntrials
        out = mtl(zeros(outT, dims...))
        lhs = mtl(ones(inT, dims...))
        rhs = mtl(ones(inT, dims...))
        # Metal.@sync so the elapsed time covers the full GPU execution,
        # not just kernel submission.
        elapsed = @elapsed Metal.@sync f(out, lhs, rhs)
        # ones * ones: every entry of the product must equal n.
        verify && @assert only(unique(Array(out))) == n
        best = min(best, elapsed)
    end
    return n_batch * 2 * Float64(n)^3 / best
end
55+
"""
    gpuarrpeakflops(; n=4096, n_batch=1, inT=Float32, outT=inT, ntrials=3, verify=true)

Benchmark the generic `GPUArrays.generic_matmatmul!` fallback kernel;
returns flops based on the fastest of `ntrials` runs.
"""
function gpuarrpeakflops(; n::Integer=4096,
                           n_batch::Integer=1,
                           inT::DataType=Float32,
                           outT::DataType=inT,
                           ntrials::Integer=3,
                           verify=true)
    if n_batch != 1
        @warn "n_batch > 1 not supported for `GPUArrays.generic_matmatmul!`, running with n_batch=1"
    end
    # alpha=1, beta=0: plain C = A*B with no accumulation into C.
    matmul = (c, a, b) -> GPUArrays.generic_matmatmul!(
        c, LinearAlgebra.wrap(a, 'N'), LinearAlgebra.wrap(b, 'N'), 1, 0)
    return _peakflops(matmul, n, 1, inT, outT, ntrials; verify)
end
66+
"""
    mpspeakflops(; n=4096, n_batch=1, inT=Float32, outT=inT, ntrials=3, verify=true)

Benchmark `MPS.matmul!` (Metal Performance Shaders); returns flops based
on the fastest of `ntrials` runs.
"""
function mpspeakflops(; n::Integer=4096,
                        n_batch::Integer=1,
                        inT::DataType=Float32,
                        outT::DataType=inT,
                        ntrials::Integer=3,
                        verify=true)
    return _peakflops(MPS.matmul!, n, n_batch, inT, outT, ntrials; verify=verify)
end
74+
"""
    graphpeakflops(; n=4096, n_batch=1, inT=Float32, outT=inT, ntrials=3, verify=true)

Benchmark `MPSGraphs.graph_matmul!` (MPSGraph backend); returns flops
based on the fastest of `ntrials` runs.
"""
function graphpeakflops(; n::Integer=4096,
                          n_batch::Integer=1,
                          inT::DataType=Float32,
                          outT::DataType=inT,
                          ntrials::Integer=3,
                          verify=true)
    return _peakflops(MPSGraphs.graph_matmul!, n, n_batch, inT, outT, ntrials; verify=verify)
end
82+
"""
    anepeakflops(; kwargs...)

Run `graphpeakflops` with a compilation descriptor that permits MPSGraph
to offload eligible operations; forwards all keyword arguments and
returns its result. The global default descriptor is always restored,
even if the benchmark throws.
"""
function anepeakflops(; kwargs...)
    # VERY HACKY: swap the global MPSGraph compilation descriptor for the
    # duration of the benchmark.
    newDesc = MPSGraphs.MPSGraphCompilationDescriptor()
    # Optimization level 1 allows operations to be moved to the neural
    # engine, which is what this benchmark is meant to measure.
    # NOTE(review): the original comment claimed level 0 *avoids* the ANE,
    # which matches the code only if the default descriptor uses level 0 —
    # confirm against MPSGraphs' defaults.
    newDesc.optimizationLevel = MPSGraphs.MPSGraphOptimizationLevel1

    oldDesc = MPSGraphs._default_exec_desc[].compilationDescriptor

    MPSGraphs._default_exec_desc[].compilationDescriptor = newDesc
    try
        return graphpeakflops(; kwargs...)
    finally
        # Restore the previous descriptor no matter how the benchmark exits;
        # the original version leaked the ANE descriptor on error.
        MPSGraphs._default_exec_desc[].compilationDescriptor = oldDesc
    end
end
96+
97+
"""
    compare(Ns, Fs, inT, outT=inT; n_batch=1, ntrials)

Run every `(f, label)` benchmark pair in `Fs` over each matrix size in
`Ns` with element types `inT`/`outT`, returning a
`Dict{String, Vector{Float64}}` mapping label to flops-per-size.
Labels containing "ANE" are skipped unless `outT == Float16` or the
multiply is the mixed-precision `Float16 -> Float32` case.
"""
function compare(Ns, Fs, inT, outT=inT; n_batch=1, ntrials)
    # ANE-backed entries are only meaningful for Float16 output or the
    # Float16 -> Float32 mixed-precision case; drop them otherwise.
    newFs = if (outT == Float16 || (outT == Float32 && inT == Float16))
        Fs
    else
        filter(x -> !occursin("ANE", x[2]), Fs)
    end

    # Concretely typed result container (was an untyped Dict()).
    results = Dict{String, Vector{Float64}}()
    for (_, info_str) in newFs
        results[info_str] = Float64[]
    end

    prefixstr = "\33[2K\r($inT, $outT) "  # ANSI clear-line + carriage return
    @time "$((inT, outT))" begin
        for n in Ns
            # Verification compares results against the exact integer n,
            # which is only valid when n is exactly representable in the
            # float types involved.
            verify = (n < maxintfloat(outT) && (inT != Float16 || (n < maxintfloat(inT))))
            n_str = "$n: "
            for (f, info_str) in newFs
                print(prefixstr, n_str, info_str)
                push!(results[info_str], f(; inT, outT, n, n_batch, ntrials, verify))
                # Keep one benchmark's garbage from distorting the next.
                GC.gc()
            end
        end
        print("\33[2K\r")
    end
    return results
end
125+
126+
"""
    main(; Ns, Fs, n_batch=1, ntrials=5, outpath="", outtype="svg", plt_title=PLOT_TITLE)

Benchmark every `(f, label)` pair in `Fs` across six fixed
(input, output) element-type combinations, draw the results in a 2×3
plot grid, optionally save the figure under `outpath` (skipped when
`outpath === nothing`), and return `(res, finalplot)` where `res` maps
each type pair to `(plt=..., results=...)`.
"""
function main(; Ns=[50, 64, 100, 128, 250, 256, 500, 512, 1000, 1024, 2000, 2048, 4000, 4096, 6000, 6144, 8000, 8192],#, 10000],
                Fs=[
                    (mpspeakflops, "MPS"),
                    (graphpeakflops, "MPSGraph"),
                    (anepeakflops, "MPSGraph (ANE)"),
                    # (gpuarrpeakflops, "GPUArrays"),
                    # (cpupeakflops, "CPU (AppleAccelerate)"), # Uncomment to test CPU performance
                ],
                n_batch=1,
                ntrials=5,
                outpath="",
                outtype="svg",
                plt_title=PLOT_TITLE)
    # (input eltype, output eltype) combinations to benchmark.
    Ts = [
        (Int8, Float16),
        (Int8, Float32),
        (Int16, Float32),
        (Float16, Float16),
        (Float16, Float32),
        (Float32, Float32),
    ]

    res = Dict()

    ylim_upper = 9e12  # baseline y-axis limit; grows if any result exceeds it

    for (inT, outT) in Ts
        tmpres = compare(Ns, Fs, inT, outT; n_batch, ntrials)

        plt = plot(xlabel="N, n_batch=$(n_batch)", legendtitle="($inT, $outT)")
        # Iterate Fs directly instead of zip(tmpres, Fs): `compare` may
        # filter out ANE entries, and zipping a Dict (arbitrary order)
        # against Fs could mispair labels or KeyError on a missing one.
        for (_, info_str) in Fs
            haskey(tmpres, info_str) || continue
            flops = tmpres[info_str]
            peak = maximum(flops)
            peakf = @sprintf("%.3e", peak)
            if peak > ylim_upper
                ylim_upper = peak * 1.02
            end
            plot!(plt, Ns, flops; linewidth=1.5, label="$(peakf) peak: $info_str")
        end
        res[(inT, outT)] = (plt=plt, results=tmpres)
    end

    finalplot = plot(res[Ts[1]].plt, res[Ts[2]].plt, res[Ts[3]].plt, res[Ts[4]].plt, res[Ts[5]].plt, res[Ts[6]].plt; layout=(2,3),
                     ylim=(0, ylim_upper),
                     plot_title=plt_title,
                     tickfonthalign=:left,
                     bottommargin=15pt,
                     size=(2000, 1200))
    if !isnothing(outpath)
        savefig(plot(finalplot, dpi=500), joinpath(outpath, "bench_all_$(n_batch).$outtype"))
    end
    return res, finalplot
end

0 commit comments

Comments
 (0)