From 3de7216e725539ef9a56ee524c5475888d766c7d Mon Sep 17 00:00:00 2001 From: Chris Bieneman Date: Thu, 20 Nov 2025 15:42:32 -0600 Subject: [PATCH 1/2] [0035] Align matrix-vector APIs with coopvec This aligns the matrix-vector APIs with the SM 6.9 cooperative vector feature such that the matrix is an `A` matrix and the vectors are column vectors rather than row vectors. --- proposals/0035-linalg-matrix.md | 113 +++++++++++++++----------------- 1 file changed, 53 insertions(+), 60 deletions(-) diff --git a/proposals/0035-linalg-matrix.md b/proposals/0035-linalg-matrix.md index 10d69215..869e8dce 100644 --- a/proposals/0035-linalg-matrix.md +++ b/proposals/0035-linalg-matrix.md @@ -190,42 +190,39 @@ Multiply(const Matrix, template -vector Multiply(vector, - Matrix); +vector Multiply(Matrix, + vector); template -vector MultiplyAdd(vector, - Matrix, - vector); +vector MultiplyAdd(Matrix, + vector, vector); -template +template typename hlsl::enable_if::Size == M, vector >::type - MultiplyAdd(InterpretedVector, - Matrix, + MultiplyAdd(Matrix, + InterpretedVector, vector); template vector - MultiplyAdd(vector, - Matrix, - VectorRef); + MultiplyAdd(Matrix, + vector, VectorRef); -template +template typename hlsl::enable_if::Size == M, vector >::type - MultiplyAdd(InterpretedVector, - Matrix, + MultiplyAdd(Matrix, + InterpretedVector, VectorRef); // Outer product functions @@ -282,32 +279,30 @@ ByteAddressBuffer B : register(t0); void CoopVec() { using namespace dx::linalg; - using MatrixBTy = Matrix; + using MatrixBTy = + Matrix; vector Vec = (vector)0; MatrixBTy MatB = MatrixBTy::Load( MBuf, 0, /* Row stride = number of columns * element size */ 16 * 4, MatrixLayout::RowMajor); - vector Layer1 = Multiply(Vec, MatB); + vector Layer1 = Multiply(MatB, Vec); vector NullBias = (vector)0; - vector Layer2 = MultiplyAdd(Layer1, MatB, NullBias); + vector Layer2 = MultiplyAdd(MatB, Layer1, NullBias); VectorRef MemBias = {MBuf, - /*start offset*/ 4096}; - vector Layer3 = 
MultiplyAdd(Layer2, MatB, MemBias); + /*start offset*/ 4096}; + vector Layer3 = MultiplyAdd(MatB, Layer2, MemBias); // Clang doesn't yet support packed types. #ifdef __hlsl_dx_compiler vector SomeData = (vector)0; vector Layer4 = MultiplyAdd( - MakeInterpretedVector(SomeData), MatB, - MemBias); + MatB, MakeInterpretedVector(SomeData), MemBias); vector Layer5 = MultiplyAdd( - MakeInterpretedVector(SomeData), MatB, - NullBias); + MatB, MakeInterpretedVector(SomeData), NullBias); #endif } ``` @@ -416,7 +411,7 @@ The following table summarizes the operations supported for each matrix scope: | `Matrix::SumAccumulate()` | ✗ | ✓ | ✓ | | `linalg::Multiply(Matrix, Matrix)` | ✗ | ✓ | ✓ | | `linalg::Multiply(vector, Matrix)` | ✓ | ✗ | ✗ | -| `linalg::MultiplyAdd(vector, Matrix, vector)` | ✓ | ✗ | ✗ | +| `linalg::MultiplyAdd(Matrix, vector, vector)` | ✓ | ✗ | ✗ | | `linalg::OuterProduct(vector, vector)` | ✓ | ✓ | ✓ | Throughout this document a matrix may be described as having a scope as @@ -925,21 +920,21 @@ infers the type of the output accumulator to match the input vector element type the other overload takes a template parameter for the output matrix element type. All matrix scopes are allowed for the output matrix. -#### linalg::MultiplyAdd(vector, Matrix, vector) +#### linalg::MultiplyAdd(Matrix, vector, vector) ``` c++ template vector - linalg::MultiplyAdd(vector, - Matrix, + linalg::MultiplyAdd(Matrix, + vector, vector); ``` Requires `Thread` scope matrix input, may be called from divergent control flow. The `linalg::MultiplyAdd` function has an overload that takes an `M`-element, an -MxK `B` matrix with `Thread` scope, and a `K`-element vector. The operation +MxK `A` matrix with `Thread` scope, and a `K`-element vector. The operation multiplies the `M`-element vector by the matrix then adds the `K`-element vector producing a result `K`-element vector. @@ -1209,37 +1204,37 @@ Must be called from wave-uniform control flow. 
``` llvm declare <[NUMo] x [TYo]> @dx.op.matvecmul.v[NUMo][TYo].v[NUMi][TYi]( immarg i32, ; opcode + %dx.types.MatrixRef, ; matrix A <[NUMi] x [TYi]>, ; input vector - immarg i32, ; input interpretation type (DXILComponentType) - %dx.types.MatrixRef ; matrix A + immarg i32 ; input interpretation type (DXILComponentType) ) ``` -This operation implements a row-vector multiplication against a `B` matrix of +This operation implements a column-vector multiplication against an `A` matrix of `Thread` scope. Validation will enforce that: * The input vector length matches the `M` matrix dimension -* The matrix A is a `B` matrix of `Thread` scope +* The matrix A is an `A` matrix of `Thread` scope ``` llvm declare <[NUMo] x [TYo]> @dx.op.matvecmuladd.v[NUMo][TYo].v[NUMi][TYi].v[NUMo][TYb]( immarg i32, ; opcode + %dx.types.MatrixRef, ; matrix A <[NUMi] x [TYi]>, ; input vector immarg i32, ; input interpretation type (DXILComponentType) - %dx.types.MatrixRef, ; matrix A <[NUMo] x [TYb]>, ; bias vector immarg i32 ; bias interpretation type (DXILComponentType) ) ``` -This operation implements a row-vector multiplication against a `B` matrix of +This operation implements a column-vector multiplication against an `A` matrix of `Thread` scope with a bias vector added to the result. Validation will enforce that: * The input vector length matches the `M` matrix dimension * The bias vector length matches the `N` matrix dimension -* The matrix A is a `B` matrix of `Thread` scope +* The matrix A is an `A` matrix of `Thread` scope ```llvm declare void @dx.op.matrixAccumulateToDescriptor( @@ -1371,7 +1366,7 @@ in the [`DXILComponentType` enumeration](#dxil-enumerations). ## Appendix 2: HLSL Header -[Compiler Explorer](https://godbolt.org/z/W5a7zbPr3) +[Compiler Explorer](https://godbolt.org/z/MG55ahKTE) > Note: this mostly works with Clang, but has some issues to work out still. 
```cpp @@ -1636,15 +1631,14 @@ Multiply(const Matrix, template -vector Multiply(vector, - Matrix); +vector Multiply(Matrix, + vector); template -vector MultiplyAdd(vector, - Matrix, - vector); +vector MultiplyAdd(Matrix, + vector, vector); template ::Size == M, vector >::type - MultiplyAdd(InterpretedVector, - Matrix, + MultiplyAdd(Matrix, + InterpretedVector, vector); template vector - MultiplyAdd(vector, - Matrix, - VectorRef); + MultiplyAdd(Matrix, + vector, VectorRef); template ::Size == M, vector >::type - MultiplyAdd(InterpretedVector, - Matrix, + MultiplyAdd(Matrix, + InterpretedVector, VectorRef); // Outer product functions @@ -1720,29 +1713,29 @@ ByteAddressBuffer MBuf : register(t0); void CoopVec() { using namespace dx::linalg; using MatrixBTy = - Matrix; + Matrix; vector Vec = (vector)0; MatrixBTy MatB = MatrixBTy::Load( MBuf, 0, /* Row stride = number of columns * element size */ 16 * 4, MatrixLayout::RowMajor); - vector Layer1 = Multiply(Vec, MatB); + vector Layer1 = Multiply(MatB, Vec); vector NullBias = (vector)0; - vector Layer2 = MultiplyAdd(Layer1, MatB, NullBias); + vector Layer2 = MultiplyAdd(MatB, Layer1, NullBias); VectorRef MemBias = {MBuf, /*start offset*/ 4096}; - vector Layer3 = MultiplyAdd(Layer2, MatB, MemBias); + vector Layer3 = MultiplyAdd(MatB, Layer2, MemBias); // Clang doesn't yet support packed types. #ifdef __hlsl_dx_compiler vector SomeData = (vector)0; vector Layer4 = MultiplyAdd( - MakeInterpretedVector(SomeData), MatB, MemBias); + MatB, MakeInterpretedVector(SomeData), MemBias); vector Layer5 = MultiplyAdd( - MakeInterpretedVector(SomeData), MatB, NullBias); + MatB, MakeInterpretedVector(SomeData), NullBias); #endif } From 29913ed174adbd9a4c29fd387778aa11a3947eee Mon Sep 17 00:00:00 2001 From: Chris Bieneman Date: Fri, 21 Nov 2025 13:43:04 -0600 Subject: [PATCH 2/2] Update coopvec example more comprehensively. 
--- proposals/0035-linalg-matrix.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/proposals/0035-linalg-matrix.md b/proposals/0035-linalg-matrix.md index 869e8dce..5bb9a49e 100644 --- a/proposals/0035-linalg-matrix.md +++ b/proposals/0035-linalg-matrix.md @@ -279,21 +279,21 @@ ByteAddressBuffer B : register(t0); void CoopVec() { using namespace dx::linalg; - using MatrixBTy = + using MatrixATy = Matrix; vector Vec = (vector)0; - MatrixBTy MatB = MatrixBTy::Load( + MatrixATy MatA = MatrixATy::Load( MBuf, 0, /* Row stride = number of columns * element size */ 16 * 4, MatrixLayout::RowMajor); - vector Layer1 = Multiply(MatB, Vec); + vector Layer1 = Multiply(MatA, Vec); vector NullBias = (vector)0; - vector Layer2 = MultiplyAdd(MatB, Layer1, NullBias); + vector Layer2 = MultiplyAdd(MatA, Layer1, NullBias); VectorRef MemBias = {MBuf, /*start offset*/ 4096}; - vector Layer3 = MultiplyAdd(MatB, Layer2, MemBias); + vector Layer3 = MultiplyAdd(MatA, Layer2, MemBias); // Clang doesn't yet support packed types. #ifdef __hlsl_dx_compiler @@ -1366,7 +1366,7 @@ in the [`DXILComponentType` enumeration](#dxil-enumerations). ## Appendix 2: HLSL Header -[Compiler Explorer](https://godbolt.org/z/MG55ahKTE) +[Compiler Explorer](https://godbolt.org/z/zfK5WKoYP) > Note: this mostly works with Clang, but has some issues to work out still. 
```cpp @@ -1712,21 +1712,21 @@ ByteAddressBuffer MBuf : register(t0); void CoopVec() { using namespace dx::linalg; - using MatrixBTy = + using MatrixATy = Matrix; vector Vec = (vector)0; - MatrixBTy MatB = MatrixBTy::Load( + MatrixATy MatA = MatrixATy::Load( MBuf, 0, /* Row stride = number of columns * element size */ 16 * 4, MatrixLayout::RowMajor); - vector Layer1 = Multiply(MatB, Vec); + vector Layer1 = Multiply(MatA, Vec); vector NullBias = (vector)0; - vector Layer2 = MultiplyAdd(MatB, Layer1, NullBias); + vector Layer2 = MultiplyAdd(MatA, Layer1, NullBias); VectorRef MemBias = {MBuf, /*start offset*/ 4096}; - vector Layer3 = MultiplyAdd(MatB, Layer2, MemBias); + vector Layer3 = MultiplyAdd(MatA, Layer2, MemBias); // Clang doesn't yet support packed types. #ifdef __hlsl_dx_compiler