Skip to content

Commit 7706211

Browse files
Faster matmul sometimes (#580)
* Use MPS instead of MPSGraph matmul when optimal
* Faster testing
* Fix
* Algorithm selection
* flopscomp improvements
* Fix & tests
* No need for AppleAccelerate
* More specific error
* More tests
1 parent ab25f7b commit 7706211

File tree

6 files changed

+125
-33
lines changed

6 files changed

+125
-33
lines changed

Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Preferences = "21216c6a-2e73-6563-6e65-726566657250"
2121
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
2222
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
2323
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
24+
ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
2425
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
2526
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
2627

@@ -49,6 +50,7 @@ Preferences = "1"
4950
Printf = "1"
5051
Random = "1"
5152
SHA = "0.7"
53+
ScopedValues = "1.3.0"
5254
SpecialFunctions = "2"
5355
StaticArrays = "1"
5456
UUIDs = "1"

examples/flopscomp.jl

Lines changed: 36 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
2-
using Metal, GPUArrays, LinearAlgebra, Printf, AppleAccelerate
1+
using Metal, GPUArrays, LinearAlgebra, Printf#, AppleAccelerate
32

43
testing = (@isdefined TESTING) && TESTING
54

@@ -8,14 +7,15 @@ testing = (@isdefined TESTING) && TESTING
87
using Plots.Measures
98
end
109

11-
const Ts=[
12-
(Int8, Float16),
13-
(Int8, Float32),
14-
(Int16, Float32),
15-
(Float16, Float16),
16-
(Float16, Float32),
17-
(Float32, Float32),
18-
]
10+
Ts=[
11+
(Int8, Float16),
12+
(Int8, Float32),
13+
(Int16, Float32),
14+
(Float16, Float16),
15+
(Float16, Float32),
16+
(Float32, Float32),
17+
]
18+
DEFAULT_NS = [50, 64, 100, 128, 250, 256, 500, 512, 1000, 1024, 1500, 2000, 2048, 2500, 3000, 4000, 4096, 5000, 6000, 6144, 8000, 8192]
1919

2020
n_gpu_cores = "??"
2121
# Comment this out if scary. Please mention number of cores in your comment when uploading the figure
@@ -68,6 +68,16 @@ function gpuarrpeakflops(; n::Integer=4096,
6868
GPUArrays.generic_matmatmul!(c, LinearAlgebra.wrap(a, 'N'), LinearAlgebra.wrap(b, 'N'), 1, 0)
6969
end
7070
end
71+
function defaultpeakflops(; n::Integer=4096,
72+
n_batch::Integer=1,
73+
inT::DataType=Float32,
74+
outT::DataType=inT,
75+
ntrials::Integer=3,
76+
verify=true)
77+
_peakflops(n, 1, inT, outT, ntrials; verify) do c, a, b
78+
LinearAlgebra.generic_matmatmul!(c, 'N', 'N', a, b, 1, 0)
79+
end
80+
end
7181
function mpspeakflops(; n::Integer=4096,
7282
n_batch::Integer=1,
7383
inT::DataType=Float32,
@@ -128,25 +138,25 @@ function compare(Ns, Fs, inT, outT=inT; n_batch=1, ntrials)
128138
return results
129139
end
130140

131-
function runcomparison(; Ns=[50, 64, 100, 128, 250, 256, 500, 512, 1000, 1024, 2000, 2048, 4000, 4096, 6000, 6144, 8000, 8192],#, 10000],
132-
Fs=[
133-
(mpspeakflops, "MPS"),
134-
(graphpeakflops, "MPSGraph"),
135-
(anepeakflops, "MPSGraph (ANE)"),
136-
# (gpuarrpeakflops, "GPUArrays"),
137-
# (cpupeakflops, "CPU (AppleAccelerate)"), # Uncomment to test CPU performance
138-
],
139-
n_batch=1,
140-
ntrials=5)
141-
res = Dict()
141+
DEFAULT_FS = [
142+
(mpspeakflops, "MPS"),
143+
(graphpeakflops, "MPSGraph"),
144+
(defaultpeakflops, "Default"),
145+
# (anepeakflops, "MPSGraph (ANE)"),
146+
# (gpuarrpeakflops, "GPUArrays"),
147+
# (cpupeakflops, "CPU (AppleAccelerate)"), # Uncomment to test CPU performance
148+
]
142149

150+
function runcomparison(; Ns=DEFAULT_NS, Fs=DEFAULT_FS, n_batch=1, ntrials=5)
151+
res = Dict()
143152
for (inT, outT) in Ts
144153
res[(inT,outT)] = (n_batch, Ns, compare(Ns, Fs, inT, outT; n_batch, ntrials))
145154
end
146155
return res
147156
end
148157

149-
function plot_results(res, Fs=["MPS", "MPSGraph", "MPSGraph (ANE)"]; outpath=nothing, outtype="svg", plt_title=PLOT_TITLE)
158+
function plot_results(res, Fs=DEFAULT_FS; outpath=nothing, outtype="svg", plt_title=PLOT_TITLE)
159+
Fs = get.(Fs, 2, "You shouldn't be reading this")
150160
ylim_upper = 9e12
151161
resplts = []
152162

@@ -164,7 +174,7 @@ function plot_results(res, Fs=["MPS", "MPSGraph", "MPSGraph (ANE)"]; outpath=not
164174
if maximum(flops) > ylim_upper
165175
ylim_upper = maximum(flops) * 1.02
166176
end
167-
plot!(plt, Ns, tmpres[info_str]; linewidth=1.5, label="$(peakf) peak: $info_str")
177+
plot!(plt, Ns, tmpres[info_str]; linewidth=1.5, label="$(peakf) peak: $info_str", α=0.8)
168178
end
169179
push!(resplts, plt)
170180
push!(n_batches, n_batch)
@@ -184,4 +194,7 @@ end
184194

185195
if testing
186196
runcomparison(Ns=[50, 64, 100, 128, 250, 256, 500, 512])
197+
elseif abspath(PROGRAM_FILE) == @__FILE__
198+
res = runcomparison()
199+
plot_results(res; outpath=".")
187200
end

src/Metal.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ using ExprTools: splitdef, combinedef
1212
using ObjectiveC, .CoreFoundation, .Foundation, .Dispatch, .OS
1313
import ObjectiveC: is_macos, darwin_version, macos_version
1414
import KernelAbstractions
15+
using ScopedValues
1516

1617
include("version.jl")
1718

src/linalg.jl

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ using .MPSGraphs: MPSGRAPH_VALID_MATMUL_TYPES, MPSGRAPH_VALID_MATVECMUL_TYPES,
66
graph_matmul!, graph_matvecmul!
77

88
@inline function supports_mps_matmul(A, B, C, valid_types)
9-
MPS.is_supported(device(A)) &&
9+
MPS.is_supported(device(C)) &&
1010
eltype(A) == eltype(B) &&
1111
(eltype(A), eltype(C)) in valid_types
1212
end
1313

1414
@inline function supports_mpsgraph_matmul(A, B, C, valid_types)
15-
MPS.is_supported(device(A)) &&
15+
MPS.is_supported(device(C)) &&
1616
eltype(A) == eltype(B) &&
1717
(eltype(A), eltype(C)) in valid_types &&
1818
# TODO: remove this limitation
@@ -21,6 +21,19 @@ end
2121
C.offset == 0
2222
end
2323

24+
# Assumes support for MPS matrix multiplication has been verified elsewhere
25+
@inline function should_use_MPS(A, _, C)
26+
rows = size(C,1)
27+
cols = size(C,2)
28+
# TODO: matvecmul different?
29+
(eltype(A) <: Integer && rows <= 2000 && cols <= 2000 ) ||
30+
eltype(A) <: AbstractFloat && rows <= 6000 && cols <= 6000 && Metal.supports_family(device(C), MTL.MTLGPUFamilyApple9)
31+
end
32+
33+
# Supported values are :auto, :MPS, :MPSGraph, and :GPUArrays
34+
const matmul_alg = ScopedValue(:auto)
35+
matmul_alg_error(alg, inT, outT, vec) = error("Matrix-$(vec ? "Vector" : "Matrix") multiplication algorithm `:$alg` is not supported for input eltype $inT and output eltype $outT.")
36+
2437
LinearAlgebra.generic_matmatmul!(C::MtlMatrix, tA, tB, A::MtlMatrix, B::MtlMatrix, _add::MulAddMul) =
2538
LinearAlgebra.generic_matmatmul!(C, tA, tB, A, B, _add.alpha, _add.beta)
2639
@autoreleasepool function LinearAlgebra.generic_matmatmul!(C::MtlMatrix, tA, tB,
@@ -46,13 +59,20 @@ LinearAlgebra.generic_matmatmul!(C::MtlMatrix, tA, tB, A::MtlMatrix, B::MtlMatri
4659
transA = tA == 'T' || tA == 'C'
4760
transB = tB == 'T' || tB == 'C'
4861

62+
alg = matmul_alg[]
63+
mps_supported = supports_mps_matmul(A, B, C, MPS_VALID_MATMUL_TYPES)
64+
mpsgraph_supported = supports_mpsgraph_matmul(A, B, C, MPSGRAPH_VALID_MATMUL_TYPES)
4965
# If possible, dispatch to MPSGraphs, then performance shaders
50-
if supports_mpsgraph_matmul(A, B, C, MPSGRAPH_VALID_MATMUL_TYPES)
66+
if alg === :MPSGraph || (alg === :auto && mpsgraph_supported && !should_use_MPS(A, B, C))
67+
mpsgraph_supported || matmul_alg_error(alg, eltype(A), eltype(C), false)
5168
graph_matmul!(C, A, B, alpha, beta, transA, transB)
52-
elseif supports_mps_matmul(A, B, C, MPS_VALID_MATMUL_TYPES) # TODO: Remove once contiguous views are working
69+
elseif alg === :MPS || (alg === :auto && mps_supported)
70+
mps_supported || matmul_alg_error(alg, eltype(A), eltype(C), false)
5371
matmul!(C, A, B, alpha, beta, transA, transB)
54-
else
72+
elseif alg === :GPUArrays || alg === :auto
5573
GPUArrays.generic_matmatmul!(C, wrap(A, tA), wrap(B, tB), alpha, beta)
74+
else
75+
error(":$alg is not a valid matmul algorithm. Options are: `:auto`, `:MPS`, `:MPSGraph`, `:GPUArrays`")
5676
end
5777
end
5878

@@ -81,13 +101,20 @@ LinearAlgebra.generic_matvecmul!(C::MtlVector, tA::AbstractChar, A::MtlMatrix, B
81101

82102
transA = tA == 'T' || tA == 'C'
83103

104+
alg = matmul_alg[]
105+
mps_supported = supports_mps_matmul(A, B, C, MPS_VALID_MATVECMUL_TYPES)
106+
mpsgraph_supported = supports_mpsgraph_matmul(A, B, C, MPSGRAPH_VALID_MATVECMUL_TYPES)
84107
# If possible, dispatch to MPSGraphs, then performance shaders
85-
if supports_mpsgraph_matmul(A, B, C, MPSGRAPH_VALID_MATVECMUL_TYPES)
108+
if alg === :MPSGraph || (alg === :auto && mpsgraph_supported)
109+
mpsgraph_supported || matmul_alg_error(alg, eltype(A), eltype(C), true)
86110
graph_matvecmul!(C, A, B, alpha, beta, transA)
87-
elseif supports_mps_matmul(A, B, C, MPS_VALID_MATVECMUL_TYPES) # TODO: Remove once contiguous views are working
111+
elseif alg === :MPS || (alg === :auto && mps_supported)
112+
mps_supported || matmul_alg_error(alg, eltype(A), eltype(C), true)
88113
matvecmul!(C, A, B, alpha, beta, transA)
89-
else
114+
elseif alg === :GPUArrays || alg === :auto
90115
GPUArrays.generic_matmatmul!(C, wrap(A, tA), B, alpha, beta)
116+
else
117+
error(":$alg is not a valid matmul algorithm. Options are: `:auto`, `:MPS`, `:MPSGraph`, `:GPUArrays`")
91118
end
92119
end
93120

test/Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
[deps]
22
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
3-
AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924"
43
BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
54
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
65
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
@@ -14,6 +13,7 @@ ObjectiveC = "e86c9b32-1129-44ac-8ea0-90d5bb39ded9"
1413
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
1514
REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
1615
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
16+
ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
1717
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
1818
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
1919
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

test/linalg.jl

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,56 @@
1-
using LinearAlgebra
1+
using LinearAlgebra, ScopedValues
22

33
if MPS.is_supported(device())
44

5+
@testset "matmul algorithm selection" begin
6+
# test that unsupported configurations error properly
7+
N = 20
8+
function test_matmul(inT, outT; vec_b=false, alg=:auto)
9+
a = inT <: Integer ? inT.(rand(-5:5, N,N)) : rand(inT, N, N)
10+
11+
bdims = vec_b ? (N,) : (N, N)
12+
b = inT <: Integer ? inT.(rand(-5:5, bdims)) : rand(inT, bdims)
13+
14+
ma = MtlArray(a)
15+
mb = MtlArray(b)
16+
mc = fill!(similar(mb, outT), zero(outT))
17+
18+
@with (Metal.matmul_alg => alg) mul!(mc,ma,mb)
19+
20+
return all((outT.(a)*outT.(b)) .≈ Array(mc))
21+
end
22+
23+
for vec_b in (true, false)
24+
@testset let vec_b = vec_b
25+
# Unsupported for MPS and MPSGraph
26+
@test_throws "Matrix-$(vec_b ? "Vector" : "Matrix") multiplication algorithm `:MPS`" test_matmul(Int8, Int16; vec_b, alg=:MPS)
27+
@test_throws "Matrix-$(vec_b ? "Vector" : "Matrix") multiplication algorithm `:MPSGraph`" test_matmul(Int8, Int16; vec_b, alg=:MPSGraph)
28+
29+
# Invalid algorithm Symbol
30+
@test_throws ":bad is not a valid matmul algorithm." test_matmul(Int8, Int16; vec_b, alg=:bad)
31+
@test_throws ":bad is not a valid matmul algorithm." test_matmul(Float16, Float16; vec_b, alg=:bad)
32+
33+
# :auto
34+
@test test_matmul(Int32, Int32; vec_b) # fallback to GPUArrays
35+
@test test_matmul(Int8, Float32; vec_b) # should use MPS
36+
@test test_matmul(Float16, Float32; vec_b) # should use MPSGraph on M1/M2
37+
38+
# :MPS
39+
mpsInT = vec_b ? Float32 : Int16
40+
@test test_matmul(mpsInT, Float32; vec_b, alg=:MPS)
41+
@test test_matmul(Float16, Float32; vec_b, alg=:MPS)
42+
43+
# :MPSGraph
44+
@test test_matmul(Int8, Float32; vec_b, alg=:MPSGraph)
45+
@test test_matmul(Float16, Float32; vec_b, alg=:MPSGraph)
46+
47+
# :GPUArrays
48+
@test test_matmul(Int32, Int32; vec_b, alg=:GPUArrays)
49+
@test test_matmul(Int8, Float32; vec_b, alg=:GPUArrays)
50+
@test test_matmul(Float16, Float32; vec_b, alg=:GPUArrays)
51+
end
52+
end
53+
end
554

655
@testset "test matrix vector multiplication of views" begin
756
N = 20

0 commit comments

Comments (0)