
Commit 8c7e260

Use MPSGraph for matrix multiplication
1 parent c55bf2a commit 8c7e260

6 files changed: +240 −17 lines


lib/mpsgraphs/MPSGraphs.jl

Lines changed: 22 additions & 1 deletion
@@ -10,16 +10,37 @@ module MPSGraphs
 
 using ..Metal
 using .MTL
-using .MPS: MPSDataType, MPSMatrix, MPSVector, MPSShape, MPSNDArray
+using .MPS: MPSDataType, MPSMatrix, MPSVector, MPSShape, MPSNDArray, exportToMtlArray!
 
 using CEnum
 using ObjectiveC, .Foundation, .Dispatch
 
+# Valid combination of input (A and B matrices) and output (C) types
+# TODO: support the commented type combinations
+const MPSGRAPH_VALID_MATMUL_TYPES =
+    [
+        # (Int8, Float16),
+        # (Int8, Float32),
+        # (Int16, Float32),
+        (Float16, Float16),
+        # (Float16, Float32),
+        (Float32, Float32),
+    ]
+
+const MPSGRAPH_VALID_MATVECMUL_TYPES =
+    [
+        (Float16, Float16),
+        # (Float16, Float32),
+        (Float32, Float32),
+    ]
+
 include("libmpsgraph.jl")
 
 include("core.jl")
 include("tensor.jl")
 include("operations.jl")
 include("random.jl")
 
+include("matmul.jl")
+
 end

lib/mpsgraphs/matmul.jl

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+function _matmul!(c::MPSMatrix, ::Type{T1}, a::MPSMatrix, ::Type{T2}, b::MPSMatrix, ::Type{T3}, alpha::Number, beta::Number, transpose_a, transpose_b) where {T1, T2, T3}
+    graph = MPSGraph()
+
+    placeA = placeholderTensor(graph, size(a), T2)
+    placeB = placeholderTensor(graph, size(b), T3)
+
+    transA = if transpose_a
+        transposeTensor(graph, placeA, 0, 1, "transpose_a")
+    else
+        placeA
+    end
+
+    transB = if transpose_b
+        transposeTensor(graph, placeB, 0, 1, "transpose_b")
+    else
+        placeB
+    end
+
+    matmul = matrixMultiplicationWithPrimaryTensor(graph, transB, transA)
+
+    afteralpha = if alpha == 1
+        matmul
+    else
+        alphatensor = constantWithScalar(graph, alpha, T1)
+        multiplicationWithPrimaryTensor(graph, alphatensor, matmul)
+    end
+
+    feed = Dict(
+        placeA => MPSGraphTensorData(a),
+        placeB => MPSGraphTensorData(b)
+    )
+
+    afterbeta = if beta == 0
+        afteralpha
+    else
+        placeC = placeholderTensor(graph, UInt.(size(c)), T1)
+        feed[placeC] = MPSGraphTensorData(c)
+        betatensor = constantWithScalar(graph, beta, T1)
+        betaC = multiplicationWithPrimaryTensor(graph, betatensor, placeC)
+        additionWithPrimaryTensor(graph, afteralpha, betaC)
+    end
+
+    res = run(graph, feed, [afterbeta])
+    resultdata = only(Dict{MPSGraphTensor, MPSGraphTensorData}(res)).second
+
+    return MPSNDArray(resultdata)
+end
+
+function graph_matmul!(c::MtlArray{T1, N}, a::MtlArray{T2, N}, b::MtlArray{T3, N}, alpha::Number = true, beta::Number = false, transpose_a = false, transpose_b = false) where {T1, T2, T3, N}
+    resultndarr = _matmul!(MPSMatrix(c), T1, MPSMatrix(a), T2, MPSMatrix(b), T3, alpha, beta, transpose_a, transpose_b)
+    return exportToMtlArray!(c, resultndarr)
+end
+
+function graph_matvecmul!(c::MtlVector{T1}, a::MtlMatrix{T2}, b::MtlVector{T3}, alpha::Number = true, beta::Number = false, transpose = false) where {T1, T2, T3}
+    resultndarr = _matmul!(MPSMatrix(c), T1, MPSMatrix(a), T2, MPSMatrix(b), T3, alpha, beta, transpose, false)
+    return exportToMtlArray!(c, resultndarr)
+end
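
For reference, a minimal usage sketch of the new entry point, mirroring the call pattern in test/mpsgraphs/linalg.jl below. It assumes a Metal-capable device and that MPSGraphs is reachable as a submodule of Metal:

    using Metal, LinearAlgebra
    using Metal: MPSGraphs  # assumption: exposed as a submodule, like Metal.MPS

    a = MtlArray(rand(Float32, 8, 8))
    b = MtlArray(rand(Float32, 8, 8))
    c = MtlArray{Float32}(undef, 8, 8)

    # c = a * b (defaults: alpha = true, beta = false, no transposes)
    MPSGraphs.graph_matmul!(c, a, b)

    Array(c) ≈ Array(a) * Array(b)  # expected to hold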

lib/mpsgraphs/operations.jl

Lines changed: 33 additions & 0 deletions
@@ -1,11 +1,44 @@
 
+function constantWithScalar(graph::MPSGraph, scalar::Number, dataType)
+    obj = @objc [graph::id{MPSGraph} constantWithScalar:scalar::Float64
+                                     dataType:dataType::MPSDataType]::id{MPSGraphTensor}
+    MPSGraphTensor(obj)
+end
+
 function matrixMultiplicationWithPrimaryTensor(graph::MPSGraph, primary::MPSGraphTensor, secondary::MPSGraphTensor, name="matmul")
     obj = @objc [graph::id{MPSGraph} matrixMultiplicationWithPrimaryTensor:primary::id{MPSGraphTensor}
                                      secondaryTensor:secondary::id{MPSGraphTensor}
                                      name:name::id{NSString}]::id{MPSGraphTensor}
     MPSGraphTensor(obj)
 end
 
+function multiplicationWithPrimaryTensor(graph::MPSGraph, primary::MPSGraphTensor, secondary::MPSGraphTensor, name = "matmul")
+    obj = @objc [graph::id{MPSGraph} multiplicationWithPrimaryTensor:primary::id{MPSGraphTensor}
+                                     secondaryTensor:secondary::id{MPSGraphTensor}
+                                     name:name::id{NSString}]::id{MPSGraphTensor}
+    MPSGraphTensor(obj)
+end
+function additionWithPrimaryTensor(graph::MPSGraph, primary::MPSGraphTensor, secondary::MPSGraphTensor, name = "matmul")
+    obj = @objc [graph::id{MPSGraph} additionWithPrimaryTensor:primary::id{MPSGraphTensor}
+                                     secondaryTensor:secondary::id{MPSGraphTensor}
+                                     name:name::id{NSString}]::id{MPSGraphTensor}
+    MPSGraphTensor(obj)
+end
+
+function transposeTensor(graph::MPSGraph, tensor::MPSGraphTensor, dimension, withDimension, name = "transpose")
+    obj = @objc [graph::id{MPSGraph} transposeTensor:tensor::id{MPSGraphTensor}
+                                     dimension:dimension::NSUInteger
+                                     withDimension:withDimension::NSUInteger
+                                     name:name::id{NSString}]::id{MPSGraphTensor}
+    MPSGraphTensor(obj)
+end
+
+function identityWithTensor(graph::MPSGraph, tensor::MPSGraphTensor, name = "identity")
+    obj = @objc [graph::id{MPSGraph} identityWithTensor:tensor::id{MPSGraphTensor}
+                                     name:name::id{NSString}]::id{MPSGraphTensor}
+    MPSGraphTensor(obj)
+end
+
 run(graph::MPSGraph, feeds::Dict, targetTensors::Vector) = run(graph, MPSGraphTensorDataDictionary(feeds), NSArray(targetTensors))
 function run(graph::MPSGraph, feeds::MPSGraphTensorDataDictionary, targetTensors::NSArray)
     obj = @objc [graph::id{MPSGraph} runWithFeeds:feeds::id{MPSGraphTensorDataDictionary}
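
As a rough sketch of how these wrappers compose into a runnable graph (following the pattern in matmul.jl above; the MPSMatrix feed and the result unwrapping are assumptions based on that file):

    # Build a graph that computes 2.0 .* A and run it on the GPU.
    graph = MPSGraph()
    A = MPSMatrix(MtlArray(rand(Float32, 4, 4)))

    place = placeholderTensor(graph, size(A), Float32)
    two   = constantWithScalar(graph, 2.0, Float32)
    out   = multiplicationWithPrimaryTensor(graph, two, place)

    # Feed the placeholder, evaluate the target tensor, and unwrap the
    # result the same way _matmul! does.
    res = run(graph, Dict(place => MPSGraphTensorData(A)), [out])
    resultdata = only(Dict{MPSGraphTensor, MPSGraphTensorData}(res)).second
    ndarr = MPSNDArray(resultdata)  # matmul.jl copies this back via exportToMtlArray!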

lib/mpsgraphs/tensor.jl

Lines changed: 2 additions & 2 deletions
@@ -82,8 +82,8 @@ function MPSGraphTensorData(vector::MPSVector)
     @objc [tensor::id{MPSGraphTensorData} initWithMPSVector:vector::id{MPSVector}]::id{MPSGraphTensorData}
     return tensor
 end
-MPSGraphTensorData(vector::MtlVector{T}) where T = MPSGraphTensorData(vector.data[], convert(MPSShape, size(vector)), T)
-# MPSGraphTensorData(vector::MtlVector) = MPSGraphTensorData(MPSVector(vector))
+# MPSGraphTensorData(vector::MtlVector{T}) where T = MPSGraphTensorData(vector.data[], convert(MPSShape, size(vector)), T)
+MPSGraphTensorData(vector::MtlVector) = MPSGraphTensorData(MPSVector(vector))
 
 # rank must be between 1 and 16 inclusive
 function MPSGraphTensorData(vector::MPSVector, rank)

src/linalg.jl

Lines changed: 26 additions & 14 deletions
@@ -2,6 +2,24 @@ using LinearAlgebra
 using LinearAlgebra: MulAddMul, wrap
 using .MPS
 using .MPS: MPS_VALID_MATMUL_TYPES, MPS_VALID_MATVECMUL_TYPES, MtlFloat
+using .MPSGraphs: MPSGRAPH_VALID_MATMUL_TYPES, MPSGRAPH_VALID_MATVECMUL_TYPES,
+                  graph_matmul!, graph_matvecmul!
+
+@inline function supports_mps_matmul(A, B, C, valid_types)
+    MPS.is_supported(device(A)) &&
+        eltype(A) == eltype(B) &&
+        (eltype(A), eltype(C)) in valid_types
+end
+
+@inline function supports_mpsgraph_matmul(A, B, C, valid_types)
+    MPS.is_supported(device(A)) &&
+        eltype(A) == eltype(B) &&
+        (eltype(A), eltype(C)) in valid_types &&
+        # TODO: remove this limitation
+        A.offset == 0 &&
+        B.offset == 0 &&
+        C.offset == 0
+end
 
 LinearAlgebra.generic_matmatmul!(C::MtlMatrix, tA, tB, A::MtlMatrix, B::MtlMatrix, _add::MulAddMul) =
     LinearAlgebra.generic_matmatmul!(C, tA, tB, A, B, _add.alpha, _add.beta)
@@ -28,13 +46,10 @@ LinearAlgebra.generic_matmatmul!(C::MtlMatrix, tA, tB, A::MtlMatrix, B::MtlMatri
     transA = tA == 'T' || tA == 'C'
     transB = tB == 'T' || tB == 'C'
 
-    typA = eltype(A)
-    typB = eltype(B)
-    typC = eltype(C)
-
-    # If possible, dispatch to performance shaders
-    if MPS.is_supported(device()) &&
-            typA == typB && (typA, typC) in MPS_VALID_MATMUL_TYPES
+    # If possible, dispatch to MPSGraphs, then performance shaders
+    if supports_mpsgraph_matmul(A, B, C, MPSGRAPH_VALID_MATMUL_TYPES)
+        graph_matmul!(C, A, B, alpha, beta, transA, transB)
+    elseif supports_mps_matmul(A, B, C, MPS_VALID_MATMUL_TYPES)
         matmul!(C, A, B, alpha, beta, transA, transB)
     else
         GPUArrays.generic_matmatmul!(C, wrap(A, tA), wrap(B, tB), alpha, beta)
@@ -66,13 +81,10 @@ LinearAlgebra.generic_matvecmul!(C::MtlVector, tA::AbstractChar, A::MtlMatrix, B
 
     transA = tA == 'T' || tA == 'C'
 
-    typA = eltype(A)
-    typB = eltype(B)
-    typC = eltype(C)
-
-    # If possible, dispatch to performance shaders
-    if MPS.is_supported(device()) &&
-            typA == typB && (typA, typC) in MPS_VALID_MATVECMUL_TYPES
+    # If possible, dispatch to MPSGraphs, then performance shaders
+    if supports_mpsgraph_matmul(A, B, C, MPSGRAPH_VALID_MATVECMUL_TYPES)
+        graph_matvecmul!(C, A, B, alpha, beta, transA)
+    elseif supports_mps_matmul(A, B, C, MPS_VALID_MATVECMUL_TYPES)
         matvecmul!(C, A, B, alpha, beta, transA)
     else
         GPUArrays.generic_matmatmul!(C, wrap(A, tA), B, alpha, beta)
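
With these checks in place, ordinary LinearAlgebra calls on MtlArrays should take the MPSGraph path whenever the eltype pair appears in the tables above. A sketch, assuming a supported device:

    using Metal, LinearAlgebra

    A = MtlArray(rand(Float32, 16, 16))
    B = MtlArray(rand(Float32, 16, 16))
    C = MtlArray{Float32}(undef, 16, 16)

    mul!(C, A, B)   # (Float32, Float32) is in MPSGRAPH_VALID_MATMUL_TYPES => graph_matmul!
    mul!(C, A', B)  # adjoint arrives as tA == 'C', so transA = true on the same path

    x = MtlArray(rand(Float32, 16))
    y = MtlArray{Float32}(undef, 16)
    mul!(y, A, x)   # matrix-vector case => graph_matvecmul!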

test/mpsgraphs/linalg.jl

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+using LinearAlgebra
+
+
+if MPS.is_supported(device())
+
+@testset "mixed-precision matrix matrix multiplication" begin
+    N = 10
+    rows_a = N
+    cols_a = N
+
+    rows_b = N
+    cols_b = N
+
+    rows_c = rows_a
+    cols_c = cols_b
+
+    alpha = Float64(1)
+    beta = Float64(1)
+
+    @testset "$(input_jl_type) => $accum_jl_type" for (input_jl_type, accum_jl_type) in MPSGraphs.MPSGRAPH_VALID_MATMUL_TYPES
+        arr_a = rand(input_jl_type, (rows_a, cols_a))
+        arr_b = rand(input_jl_type, (rows_b, cols_b))
+        arr_c = zeros(accum_jl_type, (rows_c, cols_c))
+
+        buf_a = MtlArray{input_jl_type}(arr_a)
+        buf_b = MtlArray{input_jl_type}(arr_b)
+        buf_c = MtlArray{accum_jl_type}(undef, (rows_c, cols_c))
+
+        truth_c = (alpha .* accum_jl_type.(arr_a)) * accum_jl_type.(arr_b) .+ (beta .* arr_c)
+
+        MPSGraphs.graph_matmul!(buf_c, buf_a, buf_b, alpha, beta)
+
+        @test all(Array(buf_c) .≈ truth_c)
+    end
+end
+
+# XXX: Batched matmul not yet working
+# @testset "batched matrix matrix multiplication" begin
+#     N = 10
+#     batch_size = 3
+
+#     rows_a = N
+#     cols_a = N
+
+#     rows_b = N
+#     cols_b = N
+
+#     rows_c = rows_a
+#     cols_c = cols_b
+
+#     alpha = Float64(1)
+#     beta = Float64(1)
+
+#     @testset "$(input_jl_type) => $accum_jl_type" for (input_jl_type, accum_jl_type) in MPSGraphs.MPSGRAPH_VALID_MATMUL_TYPES
+#         arr_a = rand(input_jl_type, (rows_a, cols_a, batch_size))
+#         arr_b = rand(input_jl_type, (rows_b, cols_b, batch_size))
+#         arr_c = zeros(accum_jl_type, (rows_c, cols_c, batch_size))
+
+#         buf_a = MtlArray{input_jl_type}(arr_a)
+#         buf_b = MtlArray{input_jl_type}(arr_b)
+#         buf_c = MtlArray{accum_jl_type}(undef, (rows_c, cols_c, batch_size))
+
+#         truth_c = Array{accum_jl_type}(undef, (rows_c, cols_c, batch_size))
+#         for i in 1:batch_size
+#             @views truth_c[:, :, i] = (alpha .* accum_jl_type.(arr_a[:, :, i])) * accum_jl_type.(arr_b[:, :, i]) .+ (beta .* arr_c[:, :, i])
+#         end
+
+#         MPSGraphs.graph_matmul!(buf_c, buf_a, buf_b, alpha, beta)
+
+#         @test all(Array(buf_c) .≈ truth_c)
+#     end
+# end
+
+@testset "mixed-precision matrix vector multiplication" begin
+    N = 10
+    rows = N
+    cols = N
+
+    alpha = Float64(1)
+    beta = Float64(0)
+
+    @testset "$(input_jl_type) => $accum_jl_type" for (input_jl_type, accum_jl_type) in MPSGraphs.MPSGRAPH_VALID_MATVECMUL_TYPES
+        arr_a = rand(input_jl_type, (rows, cols))
+        arr_b = rand(input_jl_type, (rows))
+        arr_c = zeros(accum_jl_type, (rows))
+
+        buf_a = MtlArray{input_jl_type}(arr_a)
+        buf_b = MtlArray{input_jl_type}(arr_b)
+        buf_c = MtlArray{accum_jl_type}(undef, (rows))
+
+        truth_c = (accum_jl_type(alpha) .* accum_jl_type.(arr_a)) * accum_jl_type.(arr_b) .+ (accum_jl_type(beta) .* arr_c)
+
+        MPSGraphs.graph_matvecmul!(buf_c, buf_a, buf_b, alpha, beta)
+
+        @test all(Array(buf_c) .≈ truth_c)
+        # @test Array(buf_c) ≈ truth_c
+    end
+end
+
+end
