refine descriptions and rename operations

Hsiangkai · Hsiangkai · commit c7269d3948b6 · 2025-05-23T09:44:47.000+01:00
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1919,7 +1919,7 @@ def GPU_SubgroupMmaConstantMatrixOp : GPU_Op<"subgroup_mma_constant_matrix",
   }];
 }
 
-def GPU_SubgroupMmaExtractOp : GPU_Op<"subgroup_mma_extract",
+def GPU_SubgroupMmaExtractThreadLocalOp : GPU_Op<"subgroup_mma_extract_thread_local",
     [Pure,
      TypesMatchWith<"value type matches element type of mma_matrix",
                     "matrix", "res",
@@ -1928,20 +1928,28 @@ def GPU_SubgroupMmaExtractOp : GPU_Op<"subgroup_mma_extract",
   let summary = "Extract a value from GPU warp by invocation and indices";
 
   let description = [{
-    The `gpu.subgroup_mma_extract` operation extracts a value from `!gpu.mma_matrix`
-    by the invocation in a subgroup.
+    The `gpu.subgroup_mma_extract_thread_local` operation extracts a value from `!gpu.mma_matrix`
+    that is stored at subgroup level.
 
     This operation takes `!gpu.mma_matrix` as its first operand. It is the source
     matrix across a subgroup. The op returns a scalar value stored in the invocation
-    in the subgroup. The values of !gpu.mma_matrix are stored across multiple
-    threads in the subgroup. If there are multiple values packed in a thread, use
-    `indices` to specify the element in the local thread to extract.
+    in the subgroup.
+
+    Since `matrix` is packed into the threads within a subgroup, `indices` are
+    the indices into the values stored by each thread. That is, an index of 0 (or [0, 0])
+    does not necessarily refer to the first element of the matrix, but the first element
+    that a particular thread holds.
+
+    The mapping of matrix elements to threads is not defined by this operation and may
+    not be defined by some lowerings (such as the lowering to SPIR-V). However, for an
+    M x N matrix and a subgroup of size S, `subgroup_mma_extract_thread_local` at each
+    index in `[0, (M * N) / S)` will have the entire matrix extracted across the subgroup.
 
     Example:
 
     ```mlir
     %c0 = arith.constant 0 : index
-    %val = gpu.subgroup_mma_extract %m[%c0] : !gpu.mma_matrix<16x16xf32, "AOp"> -> f32
+    %val = gpu.subgroup_mma_extract_thread_local %m[%c0] : !gpu.mma_matrix<16x16xf32, "AOp"> -> f32
     ```
   }];
 
@@ -1954,7 +1962,7 @@ def GPU_SubgroupMmaExtractOp : GPU_Op<"subgroup_mma_extract",
   }];
 }
 
-def GPU_SubgroupMmaInsertOp : GPU_Op<"subgroup_mma_insert",
+def GPU_SubgroupMmaInsertThreadLocalOp : GPU_Op<"subgroup_mma_insert_thread_local",
     [Pure,
      TypesMatchWith<"value type matches element type of mma_matrix",
                     "matrix", "value",
@@ -1963,23 +1971,29 @@ def GPU_SubgroupMmaInsertOp : GPU_Op<"subgroup_mma_insert",
   let summary = "Insert a value into GPU warp by invocation and indices";
 
   let description = [{
-    The `gpu.subgroup_mma_insert` operation inserts a value to `!gpu.mma_matrix`
-    by the invocation in a subgroup.
+    The `gpu.subgroup_mma_insert_thread_local` operation inserts a value into `!gpu.mma_matrix`
+    that is stored at subgroup level.
 
     This operation takes scalar value as its first operand and `!gpu.mma_matrix`
-    as its second operand. It is the matrix across a subgroup. The op inserts the
-    scalar value stored in the invocation in the subgroup to the matrix. The values
-    of !gpu.mma_matrix are stored across multiple threads in the subgroup. If there
-    are multiple values packed in an invocation, use `indices` to specify the
-    location to insert in the packing.
+    as its second operand. The op inserts the scalar value into the matrix.
+
+    Since `matrix` is packed into the threads within a subgroup, `indices` are
+    the indices into the values stored by each thread. That is, an index of 0 (or [0, 0])
+    does not necessarily refer to the first element of the matrix, but the first element
+    that a particular thread holds.
+
+    The mapping of matrix elements to threads is not defined by this operation and may
+    not be defined by some lowerings (such as the lowering to SPIR-V). However, for an
+    M x N matrix and a subgroup of size S, `subgroup_mma_insert_thread_local` at each
+    index in `[0, (M * N) / S)` will have the entire matrix inserted across the subgroup.
 
     The op returns `!gpu.mma_matrix` with the updated value.
 
     Example:
 
     ```mlir
     %c0 = arith.constant 0 : index
-    %s0 = gpu.subgroup_mma_insert %val, %m[%c0] : f16, !gpu.mma_matrix<16x16xf16, "COp">
+    %s0 = gpu.subgroup_mma_insert_thread_local %val, %m[%c0] : f16, !gpu.mma_matrix<16x16xf16, "COp">
             -> !gpu.mma_matrix<16x16xf16, "COp">
     ```
   }];
diff --git a/mlir/lib/Conversion/GPUToSPIRV/WmmaOpsToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/WmmaOpsToSPIRV.cpp
@@ -114,11 +114,11 @@ struct WmmaConstantOpToSPIRVLowering final
 /// Converts GPU MMA ExtractOp to CompositeExtract SPIR-V KHR/NV cooperative
 /// matrix ops.
 struct WmmaExtractOpToSPIRVLowering final
-    : OpConversionPattern<gpu::SubgroupMmaExtractOp> {
+    : OpConversionPattern<gpu::SubgroupMmaExtractThreadLocalOp> {
   using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
-  matchAndRewrite(gpu::SubgroupMmaExtractOp op, OpAdaptor adaptor,
+  matchAndRewrite(gpu::SubgroupMmaExtractThreadLocalOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Value matrix = adaptor.getMatrix();
     auto coopType =
@@ -146,11 +146,11 @@ struct WmmaExtractOpToSPIRVLowering final
 /// Converts GPU MMA InsertOp to CompositeInsert SPIR-V KHR/NV cooperative
 /// matrix ops.
 struct WmmaInsertOpToSPIRVLowering final
-    : OpConversionPattern<gpu::SubgroupMmaInsertOp> {
+    : OpConversionPattern<gpu::SubgroupMmaInsertThreadLocalOp> {
   using OpConversionPattern::OpConversionPattern;
 
   LogicalResult
-  matchAndRewrite(gpu::SubgroupMmaInsertOp op, OpAdaptor adaptor,
+  matchAndRewrite(gpu::SubgroupMmaInsertThreadLocalOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     Value value = adaptor.getValue();
     Value matrix = adaptor.getMatrix();
diff --git a/mlir/test/Conversion/GPUToSPIRV/wmma-ops-to-spirv-khr-coop-matrix.mlir b/mlir/test/Conversion/GPUToSPIRV/wmma-ops-to-spirv-khr-coop-matrix.mlir
@@ -93,28 +93,28 @@ module attributes {
       gpu.return
     }
 
-    // CHECK-LABEL: spirv.func @gpu_wmma_extract_op
+    // CHECK-LABEL: spirv.func @gpu_wmma_extract_thread_local_op
     // CHECK-SAME: %[[ARG0:.+]]: !spirv.coopmatrix<16x16xf32, Subgroup, MatrixA>
-    gpu.func @gpu_wmma_extract_op(%m: !gpu.mma_matrix<16x16xf32, "AOp">,
+    gpu.func @gpu_wmma_extract_thread_local_op(%m: !gpu.mma_matrix<16x16xf32, "AOp">,
                                   %ptr: memref<16x16xf32, #spirv.storage_class<StorageBuffer>>) kernel
       attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 4, 1]>} {
       // CHECK: spirv.CompositeExtract %[[ARG0]][0 : i32] : !spirv.coopmatrix<16x16xf32, Subgroup, MatrixA>
       %c0 = arith.constant 0 : index
-      %val = gpu.subgroup_mma_extract %m[%c0] : !gpu.mma_matrix<16x16xf32, "AOp"> -> f32
+      %val = gpu.subgroup_mma_extract_thread_local %m[%c0] : !gpu.mma_matrix<16x16xf32, "AOp"> -> f32
       memref.store %val, %ptr[%c0, %c0] : memref<16x16xf32, #spirv.storage_class<StorageBuffer>>
       gpu.return
     }
 
-    // CHECK-LABEL: spirv.func @gpu_wmma_insert_op
+    // CHECK-LABEL: spirv.func @gpu_wmma_insert_thread_local_op
     // CHECK-SAME: %[[ARG0:.+]]: f16
     // CHECK-SAME: %[[ARG1:.+]]: !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc>
-    gpu.func @gpu_wmma_insert_op(%val: f16,
+    gpu.func @gpu_wmma_insert_thread_local_op(%val: f16,
                                  %m: !gpu.mma_matrix<16x16xf16, "COp">,
                                  %ptr: memref<16x16xf16, #spirv.storage_class<StorageBuffer>>) kernel
       attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 4, 1]>} {
       // CHECK: spirv.CompositeInsert %[[ARG0]], %[[ARG1]][0 : i32] : f16 into !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc>
       %c0 = arith.constant 0 : index
-      %s0 = gpu.subgroup_mma_insert %val, %m[%c0] : f16, !gpu.mma_matrix<16x16xf16, "COp"> -> !gpu.mma_matrix<16x16xf16, "COp">
+      %s0 = gpu.subgroup_mma_insert_thread_local %val, %m[%c0] : f16, !gpu.mma_matrix<16x16xf16, "COp"> -> !gpu.mma_matrix<16x16xf16, "COp">
       gpu.subgroup_mma_store_matrix %s0, %ptr[%c0,%c0] {leadDimension = 16 : index} :
         !gpu.mma_matrix<16x16xf16, "COp">, memref<16x16xf16, #spirv.storage_class<StorageBuffer>>
       gpu.return
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
@@ -436,11 +436,11 @@ module attributes {gpu.container_module} {
                                 %ptr: memref<16x16xf32>) {
     %zero = arith.constant 0.0 : f32
     %c0 = arith.constant 0 : index
-    // CHECK: gpu.subgroup_mma_extract
-    %val = gpu.subgroup_mma_extract %src[%c0] : !gpu.mma_matrix<16x16xf32, "COp"> -> f32
+    // CHECK: gpu.subgroup_mma_extract_thread_local
+    %val = gpu.subgroup_mma_extract_thread_local %src[%c0] : !gpu.mma_matrix<16x16xf32, "COp"> -> f32
     %m = gpu.subgroup_mma_constant_matrix %zero : !gpu.mma_matrix<16x16xf32, "COp">
-    // CHECK: gpu.subgroup_mma_insert
-    %s0 = gpu.subgroup_mma_insert %val, %m[%c0] : f32, !gpu.mma_matrix<16x16xf32, "COp"> -> !gpu.mma_matrix<16x16xf32, "COp">
+    // CHECK: gpu.subgroup_mma_insert_thread_local
+    %s0 = gpu.subgroup_mma_insert_thread_local %val, %m[%c0] : f32, !gpu.mma_matrix<16x16xf32, "COp"> -> !gpu.mma_matrix<16x16xf32, "COp">
     gpu.subgroup_mma_store_matrix %s0, %ptr[%c0, %c0] {leadDimension = 16 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<16x16xf32>
     return
   }