Commit 1432aca

Use MPSGraph for matrix multiplication

1 parent bc12f52, commit 1432aca

7 files changed: +267 −23

lib/mpsgraphs/MPSGraphs.jl

Lines changed: 25 additions & 1 deletion
@@ -10,16 +10,40 @@ module MPSGraphs

 using ..Metal
 using .MTL
-using .MPS: MPSDataType, MPSMatrix, MPSVector, MPSShape, MPSNDArray
+using .MPS: MPSDataType, MPSMatrix, MPSVector, MPSShape, MPSNDArray, exportToMtlArray!

 using CEnum
 using ObjectiveC, .Foundation, .Dispatch

+# Valid combinations of input (A and B matrices) and output (C) types.
+# The commented-out type combinations work, but are slower than with MPSMatrixMultiplication.
+const MPSGRAPH_VALID_MATMUL_TYPES =
+    [
+        # (Int8, Float16),
+        # (Int8, Float32),
+        # (Int16, Float32),
+        (Float16, Float16),
+        (Float16, Float32),
+        (Float32, Float32),
+    ]
+
+const MPSGRAPH_VALID_MATVECMUL_TYPES =
+    [
+        (Int8, Float16),
+        (Int8, Float32),
+        (Int16, Float32),
+        (Float16, Float16),
+        (Float16, Float32),
+        (Float32, Float32),
+    ]
+
 include("libmpsgraph.jl")

 include("core.jl")
 include("tensor.jl")
 include("operations.jl")
 include("random.jl")

+include("matmul.jl")
+
 end
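
Each entry in these tables pairs the shared element type of the A and B inputs with the element type of the C output; (Float16, Float32), for example, means Float16 inputs accumulated into a Float32 result. A quick sanity check of the tables, assuming the submodule is reachable as Metal.MPSGraphs:

    julia> using Metal

    julia> (Float16, Float32) in Metal.MPSGraphs.MPSGRAPH_VALID_MATMUL_TYPES
    true

    julia> (Int8, Float16) in Metal.MPSGraphs.MPSGRAPH_VALID_MATMUL_TYPES  # matvec-only combination
    false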

lib/mpsgraphs/matmul.jl

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+function _matmul!(c::MPSMatrix, ::Type{Tc}, a::MPSMatrix, b::MPSMatrix, ::Type{Tab}, alpha::Number, beta::Number, transpose_a, transpose_b) where {Tc, Tab}
+    graph = MPSGraph()
+
+    placeA = placeholderTensor(graph, size(a), Tab)
+    placeB = placeholderTensor(graph, size(b), Tab)
+
+    castA, castB = if Tc != Tab
+        castTensor(graph, placeA, Tc, "castA"),
+        castTensor(graph, placeB, Tc, "castB")
+    else
+        placeA, placeB
+    end
+
+    transA = if transpose_a
+        transposeTensor(graph, castA, 0, 1, "transpose_a")
+    else
+        castA
+    end
+
+    transB = if transpose_b
+        transposeTensor(graph, castB, 0, 1, "transpose_b")
+    else
+        castB
+    end
+
+    matmul = matrixMultiplicationWithPrimaryTensor(graph, transB, transA)
+
+    afteralpha = if alpha == 1
+        matmul
+    else
+        alphatensor = constantWithScalar(graph, alpha, Tc)
+        multiplicationWithPrimaryTensor(graph, alphatensor, matmul)
+    end
+
+    feed = Dict(
+        placeA => MPSGraphTensorData(a),
+        placeB => MPSGraphTensorData(b)
+    )
+
+    afterbeta = if beta == 0
+        afteralpha
+    else
+        placeC = placeholderTensor(graph, UInt.(size(c)), Tc)
+        feed[placeC] = MPSGraphTensorData(c)
+        betatensor = constantWithScalar(graph, beta, Tc)
+        betaC = multiplicationWithPrimaryTensor(graph, betatensor, placeC)
+        additionWithPrimaryTensor(graph, afteralpha, betaC)
+    end
+
+    res = run(graph, feed, [afterbeta])
+    resultdata = only(Dict{MPSGraphTensor, MPSGraphTensorData}(res)).second
+
+    return MPSNDArray(resultdata)
+end
+
+function graph_matmul!(c::MtlArray{Tc, N}, a::MtlArray{Tab, N}, b::MtlArray{Tab, N}, alpha::Number = true, beta::Number = false, transpose_a = false, transpose_b = false) where {Tc, Tab, N}
+    resultndarr = _matmul!(MPSMatrix(c), Tc, MPSMatrix(a), MPSMatrix(b), Tab, alpha, beta, transpose_a, transpose_b)
+    return exportToMtlArray!(c, resultndarr)
+end
+
+function graph_matvecmul!(c::MtlVector{Tc}, a::MtlMatrix{Tab}, b::MtlVector{Tab}, alpha::Number = true, beta::Number = false, transpose = false) where {Tc, Tab}
+    resultndarr = _matmul!(MPSMatrix(c), Tc, MPSMatrix(a), MPSMatrix(b), Tab, alpha, beta, transpose, false)
+    return exportToMtlArray!(c, resultndarr)
+end
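
Both entry points mutate c in place, computing c = alpha * a * b + beta * c with optional transposes; with the defaults (alpha = true, beta = false) the destination's initial contents are ignored. A minimal usage sketch, assuming a Metal-capable device and that the MPSGraphs submodule is in scope as in the tests below:

    using Metal
    using Metal: MPSGraphs

    A = MtlArray(rand(Float16, 4, 4))      # Float16 inputs...
    B = MtlArray(rand(Float16, 4, 4))
    C = MtlArray{Float32}(undef, 4, 4)     # ...accumulated into a Float32 output

    MPSGraphs.graph_matmul!(C, A, B)       # C .= A * B

    b = MtlArray(rand(Float16, 4))
    c = MtlArray{Float32}(undef, 4)
    MPSGraphs.graph_matvecmul!(c, A, b)    # c .= A * b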

lib/mpsgraphs/operations.jl

Lines changed: 41 additions & 1 deletion
@@ -1,11 +1,51 @@

-function matrixMultiplicationWithPrimaryTensor(graph::MPSGraph, primary::MPSGraphTensor, secondary::MPSGraphTensor, name="matmul")
+function castTensor(graph::MPSGraph, tensor::MPSGraphTensor, toType, name = "cast")
+    obj = @objc [graph::id{MPSGraph} castTensor:tensor::id{MPSGraphTensor}
+                                     toType:toType::MPSDataType
+                                     name:name::id{NSString}]::id{MPSGraphTensor}
+    MPSGraphTensor(obj)
+end
+
+function constantWithScalar(graph::MPSGraph, scalar::Number, dataType)
+    obj = @objc [graph::id{MPSGraph} constantWithScalar:scalar::Float64
+                                     dataType:dataType::MPSDataType]::id{MPSGraphTensor}
+    MPSGraphTensor(obj)
+end
+
+function matrixMultiplicationWithPrimaryTensor(graph::MPSGraph, primary::MPSGraphTensor, secondary::MPSGraphTensor, name = "matmul")
     obj = @objc [graph::id{MPSGraph} matrixMultiplicationWithPrimaryTensor:primary::id{MPSGraphTensor}
                                      secondaryTensor:secondary::id{MPSGraphTensor}
                                      name:name::id{NSString}]::id{MPSGraphTensor}
     MPSGraphTensor(obj)
 end

+function multiplicationWithPrimaryTensor(graph::MPSGraph, primary::MPSGraphTensor, secondary::MPSGraphTensor, name = "mul")
+    obj = @objc [graph::id{MPSGraph} multiplicationWithPrimaryTensor:primary::id{MPSGraphTensor}
+                                     secondaryTensor:secondary::id{MPSGraphTensor}
+                                     name:name::id{NSString}]::id{MPSGraphTensor}
+    MPSGraphTensor(obj)
+end
+function additionWithPrimaryTensor(graph::MPSGraph, primary::MPSGraphTensor, secondary::MPSGraphTensor, name = "add")
+    obj = @objc [graph::id{MPSGraph} additionWithPrimaryTensor:primary::id{MPSGraphTensor}
+                                     secondaryTensor:secondary::id{MPSGraphTensor}
+                                     name:name::id{NSString}]::id{MPSGraphTensor}
+    MPSGraphTensor(obj)
+end
+
+function transposeTensor(graph::MPSGraph, tensor::MPSGraphTensor, dimension, withDimension, name = "transpose")
+    obj = @objc [graph::id{MPSGraph} transposeTensor:tensor::id{MPSGraphTensor}
+                                     dimension:dimension::NSUInteger
+                                     withDimension:withDimension::NSUInteger
+                                     name:name::id{NSString}]::id{MPSGraphTensor}
+    MPSGraphTensor(obj)
+end
+
+function identityWithTensor(graph::MPSGraph, tensor::MPSGraphTensor, name = "identity")
+    obj = @objc [graph::id{MPSGraph} identityWithTensor:tensor::id{MPSGraphTensor}
+                                     name:name::id{NSString}]::id{MPSGraphTensor}
+    MPSGraphTensor(obj)
+end
+
 run(graph::MPSGraph, feeds::Dict, targetTensors::Vector) = run(graph, MPSGraphTensorDataDictionary(feeds), NSArray(targetTensors))
 function run(graph::MPSGraph, feeds::MPSGraphTensorDataDictionary, targetTensors::NSArray)
     obj = @objc [graph::id{MPSGraph} runWithFeeds:feeds::id{MPSGraphTensorDataDictionary}
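
Each wrapper binds one MPSGraph Objective-C method and returns the resulting MPSGraphTensor node, so they compose directly into larger graphs. A minimal sketch of the pattern, scaling a matrix by two on the GPU; placeholderTensor's (graph, dims, eltype) signature is inferred from its uses in matmul.jl, and a Metal-capable device is assumed:

    graph = MPSGraph()

    x   = placeholderTensor(graph, UInt.((4, 4)), Float32)
    two = constantWithScalar(graph, 2.0, Float32)
    y   = multiplicationWithPrimaryTensor(graph, two, x, "scale")  # y = 2 .* x

    a   = MtlArray(rand(Float32, 4, 4))
    res = run(graph, Dict(x => MPSGraphTensorData(MPSMatrix(a))), [y])
    # res maps each requested tensor to its MPSGraphTensorData;
    # unpack it the way _matmul! does above.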

lib/mpsgraphs/tensor.jl

Lines changed: 2 additions & 2 deletions
@@ -82,8 +82,8 @@ function MPSGraphTensorData(vector::MPSVector)
     @objc [tensor::id{MPSGraphTensorData} initWithMPSVector:vector::id{MPSVector}]::id{MPSGraphTensorData}
     return tensor
 end
-MPSGraphTensorData(vector::MtlVector{T}) where T = MPSGraphTensorData(vector.data[], convert(MPSShape, size(vector)), T)
-# MPSGraphTensorData(vector::MtlVector) = MPSGraphTensorData(MPSVector(vector))
+# MPSGraphTensorData(vector::MtlVector{T}) where T = MPSGraphTensorData(vector.data[], convert(MPSShape, size(vector)), T)
+MPSGraphTensorData(vector::MtlVector) = MPSGraphTensorData(MPSVector(vector))

 # rank must be between 1 and 16 inclusive
 function MPSGraphTensorData(vector::MPSVector, rank)

src/linalg.jl

Lines changed: 26 additions & 14 deletions
@@ -2,6 +2,24 @@ using LinearAlgebra
 using LinearAlgebra: MulAddMul, wrap
 using .MPS
 using .MPS: MPS_VALID_MATMUL_TYPES, MPS_VALID_MATVECMUL_TYPES, MtlFloat
+using .MPSGraphs: MPSGRAPH_VALID_MATMUL_TYPES, MPSGRAPH_VALID_MATVECMUL_TYPES,
+                  graph_matmul!, graph_matvecmul!
+
+@inline function supports_mps_matmul(A, B, C, valid_types)
+    MPS.is_supported(device(A)) &&
+        eltype(A) == eltype(B) &&
+        (eltype(A), eltype(C)) in valid_types
+end
+
+@inline function supports_mpsgraph_matmul(A, B, C, valid_types)
+    MPS.is_supported(device(A)) &&
+        eltype(A) == eltype(B) &&
+        (eltype(A), eltype(C)) in valid_types &&
+        # TODO: remove this limitation
+        A.offset == 0 &&
+        B.offset == 0 &&
+        C.offset == 0
+end

 LinearAlgebra.generic_matmatmul!(C::MtlMatrix, tA, tB, A::MtlMatrix, B::MtlMatrix, _add::MulAddMul) =
     LinearAlgebra.generic_matmatmul!(C, tA, tB, A, B, _add.alpha, _add.beta)

@@ -28,13 +46,10 @@ LinearAlgebra.generic_matmatmul!(C::MtlMatrix, tA, tB, A::MtlMatrix, B::MtlMatri
     transA = tA == 'T' || tA == 'C'
     transB = tB == 'T' || tB == 'C'

-    typA = eltype(A)
-    typB = eltype(B)
-    typC = eltype(C)
-
-    # If possible, dispatch to performance shaders
-    if MPS.is_supported(device()) &&
-            typA == typB && (typA, typC) in MPS_VALID_MATMUL_TYPES
+    # If possible, dispatch to MPSGraphs, then performance shaders
+    if supports_mpsgraph_matmul(A, B, C, MPSGRAPH_VALID_MATMUL_TYPES)
+        graph_matmul!(C, A, B, alpha, beta, transA, transB)
+    elseif supports_mps_matmul(A, B, C, MPS_VALID_MATMUL_TYPES)
         matmul!(C, A, B, alpha, beta, transA, transB)
     else
         GPUArrays.generic_matmatmul!(C, wrap(A, tA), wrap(B, tB), alpha, beta)

@@ -66,13 +81,10 @@ LinearAlgebra.generic_matvecmul!(C::MtlVector, tA::AbstractChar, A::MtlMatrix, B
     transA = tA == 'T' || tA == 'C'

-    typA = eltype(A)
-    typB = eltype(B)
-    typC = eltype(C)
-
-    # If possible, dispatch to performance shaders
-    if MPS.is_supported(device()) &&
-            typA == typB && (typA, typC) in MPS_VALID_MATVECMUL_TYPES
+    # If possible, dispatch to MPSGraphs, then performance shaders
+    if supports_mpsgraph_matmul(A, B, C, MPSGRAPH_VALID_MATVECMUL_TYPES)
+        graph_matvecmul!(C, A, B, alpha, beta, transA)
+    elseif supports_mps_matmul(A, B, C, MPS_VALID_MATVECMUL_TYPES)
         matvecmul!(C, A, B, alpha, beta, transA)
     else
         GPUArrays.generic_matmatmul!(C, wrap(A, tA), B, alpha, beta)
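
With these predicates in place, ordinary LinearAlgebra calls pick the fastest eligible backend: MPSGraph when the device supports MPS, the input eltypes match, the (input, output) pair is in the valid-types table, and all three arrays start at offset zero; otherwise MPSMatrixMultiplication, and failing that the generic GPUArrays kernel. A sketch of the resulting behavior, assuming this commit's dispatch is active:

    using LinearAlgebra, Metal

    A = MtlArray(rand(Float16, 32, 32))
    B = MtlArray(rand(Float16, 32, 32))
    C = MtlArray{Float32}(undef, 32, 32)

    # (Float16, Float32) is in MPSGRAPH_VALID_MATMUL_TYPES, so this
    # generic_matmatmul! call should route through graph_matmul!.
    mul!(C, A, B)

    # (Int8, Float32) is valid only for matrix-vector products:
    M = MtlArray(rand(Int8, 32, 32))
    v = MtlArray(rand(Int8, 32))
    w = MtlArray{Float32}(undef, 32)
    mul!(w, M, v)                          # eligible for graph_matvecmul!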

test/mps/linalg.jl

Lines changed: 7 additions & 5 deletions
@@ -34,17 +34,19 @@ if MPS.is_supported(device())
 end

 @testset "batched matrix matrix multiplication" begin
-    N = 10
+    M = 8
+    N = 7
+    P = 9
     batch_size = 3

-    rows_a = N
+    rows_a = M
     cols_a = N

     rows_b = N
-    cols_b = N
+    cols_b = P

-    rows_c = rows_a
-    cols_c = cols_b
+    rows_c = M
+    cols_c = P

     alpha = Float64(1)
     beta = Float64(1)

test/mpsgraphs/linalg.jl

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+using LinearAlgebra
+
+
+if MPS.is_supported(device())
+
+@testset "mixed-precision matrix matrix multiplication" begin
+    N = 10
+    rows_a = N
+    cols_a = N
+
+    rows_b = N
+    cols_b = N
+
+    rows_c = rows_a
+    cols_c = cols_b
+
+    alpha = Float64(1)
+    beta = Float64(1)
+
+    @testset "$(input_jl_type) => $accum_jl_type" for (input_jl_type, accum_jl_type) in MPSGraphs.MPSGRAPH_VALID_MATMUL_TYPES
+        arr_a = rand(input_jl_type, (rows_a, cols_a))
+        arr_b = rand(input_jl_type, (rows_b, cols_b))
+        arr_c = zeros(accum_jl_type, (rows_c, cols_c))
+
+        buf_a = MtlArray{input_jl_type}(arr_a)
+        buf_b = MtlArray{input_jl_type}(arr_b)
+        buf_c = MtlArray{accum_jl_type}(undef, (rows_c, cols_c))
+
+        truth_c = (alpha .* accum_jl_type.(arr_a)) * accum_jl_type.(arr_b) .+ (beta .* arr_c)
+
+        MPSGraphs.graph_matmul!(buf_c, buf_a, buf_b, alpha, beta)
+
+        @test all(Array(buf_c) .≈ truth_c)
+    end
+end
+
+# XXX: Batched matmul not yet working
+@testset "batched matrix matrix multiplication" begin
+    M = 8
+    N = 7
+    P = 9
+    batch_size = 3
+
+    rows_a = M
+    cols_a = N
+
+    rows_b = N
+    cols_b = P
+
+    rows_c = M
+    cols_c = P
+
+    alpha = Float64(1)
+    beta = Float64(1)
+
+    @testset "$(input_jl_type) => $accum_jl_type" for (input_jl_type, accum_jl_type) in MPSGraphs.MPSGRAPH_VALID_MATMUL_TYPES
+        arr_a = rand(input_jl_type, (rows_a, cols_a, batch_size))
+        arr_b = rand(input_jl_type, (rows_b, cols_b, batch_size))
+        arr_c = zeros(accum_jl_type, (rows_c, cols_c, batch_size))
+
+        buf_a = MtlArray{input_jl_type}(arr_a)
+        buf_b = MtlArray{input_jl_type}(arr_b)
+        buf_c = MtlArray{accum_jl_type}(undef, (rows_c, cols_c, batch_size))
+
+        truth_c = Array{accum_jl_type}(undef, (rows_c, cols_c, batch_size))
+        for i in 1:batch_size
+            @views truth_c[:, :, i] = (alpha .* accum_jl_type.(arr_a[:, :, i])) * accum_jl_type.(arr_b[:, :, i]) .+ (beta .* arr_c[:, :, i])
+        end
+
+        MPSGraphs.graph_matmul!(buf_c, buf_a, buf_b, alpha, beta)
+
+        @test all(Array(buf_c) .≈ truth_c)
+    end
+end
+
+@testset "mixed-precision matrix vector multiplication" begin
+    N = 10
+    rows = N
+    cols = N
+
+    alpha = Float64(1)
+    beta = Float64(0)
+
+    @testset "$(input_jl_type) => $accum_jl_type" for (input_jl_type, accum_jl_type) in MPSGraphs.MPSGRAPH_VALID_MATVECMUL_TYPES
+        arr_a = rand(input_jl_type, (rows, cols))
+        arr_b = rand(input_jl_type, rows)
+        arr_c = zeros(accum_jl_type, rows)
+
+        buf_a = MtlArray{input_jl_type}(arr_a)
+        buf_b = MtlArray{input_jl_type}(arr_b)
+        buf_c = MtlArray{accum_jl_type}(undef, rows)
+
+        truth_c = (accum_jl_type(alpha) .* accum_jl_type.(arr_a)) * accum_jl_type.(arr_b) .+ (accum_jl_type(beta) .* arr_c)
+
+        MPSGraphs.graph_matvecmul!(buf_c, buf_a, buf_b, alpha, beta)
+
+        @test all(Array(buf_c) .≈ truth_c)
+        # @test Array(buf_c) ≈ truth_c
+    end
+end
+
+end
