
Commit 1e2abb8

Revert "Three reverts to undo transfer_write deduplication and return… (#22521)
… to previous state (#22392)" This reverts commit 4a716e2. The underlying issue was fixed by llvm/llvm-project#165764 . Thanks to @newling for figuring out this tricky issue. Fixes: #22397 ci-extra: test_torch
1 parent aa1773b commit 1e2abb8

14 files changed: +330 −35 lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/GPUNestedLayoutDistributionPatterns.cpp

Lines changed: 86 additions & 11 deletions
@@ -10,8 +10,10 @@
 #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "iree/compiler/Utils/Indexing.h"
 #include "iree/compiler/Utils/Permutation.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
@@ -422,9 +424,70 @@ struct DistributeTransferWrite final
   using OpDistributionPattern::OpDistributionPattern;
 
   DistributeTransferWrite(MLIRContext *context, Value threadId,
-                          int64_t subgroupSize)
+                          int64_t subgroupSize, ArrayRef<int64_t> workgroupSize)
       : OpDistributionPattern(context), threadId(threadId),
-        subgroupSize(subgroupSize) {}
+        subgroupSize(subgroupSize) {
+
+    // The number of threads in the workgroup is the product of the dimensions
+    // of workgroupSize, unless workgroupSize is empty.
+    if (!workgroupSize.empty()) {
+      numThreadsInWorkgroup = llvm::product_of(workgroupSize);
+    }
+  }
+
+  /// Compute a boolean in SIMT semantics that is true for the first virtual
+  /// lane(thread) id (vtid) and virtual subgroup id (vsid) carrying broadcasted
+  /// data.
+  ///
+  /// We do this by computing a basis for vtid and vsid computation, and adding
+  /// a check for basis elements that are not used (i.e. they are duplicated)
+  /// to be zero.
+  FailureOr<Value> getNoOverlapCondition(OpBuilder &b, Location loc,
+                                         NestedLayoutAttr layout) const {
+    ArrayRef<int64_t> threadTile = layout.getThreadTile();
+    ArrayRef<int64_t> threadStrides = layout.getThreadStrides();
+    ArrayRef<int64_t> subgroupTile = layout.getSubgroupTile();
+    // Multiply the subgroup strides by subgroup_size to reflect thread id
+    // relative strides.
+    auto subgroupStrides =
+        llvm::map_to_vector(layout.getSubgroupStrides(),
+                            [&](int64_t x) { return x * subgroupSize; });
+    auto concatTiles =
+        llvm::to_vector(llvm::concat<const int64_t>(subgroupTile, threadTile));
+    auto concatStrides = llvm::to_vector(
+        llvm::concat<const int64_t>(subgroupStrides, threadStrides));
+    SmallVector<int64_t> basis;
+    SmallVector<size_t> dimToResult;
+    if (failed(basisFromSizesStrides(concatTiles, concatStrides, basis,
+                                     dimToResult))) {
+      return failure();
+    }
+    // Make the outer bound numThreadsInWorkgroup / prod(basis) to remove
+    // redundant checks.
+    if (numThreadsInWorkgroup.has_value()) {
+      int64_t outerBound =
+          numThreadsInWorkgroup.value() / llvm::product_of(basis);
+      basis.insert(basis.begin(), outerBound);
+    }
+    // Create a delinearize operation and check that all results not present in
+    // dimToResult are 0.
+    SmallVector<Value> delinearized;
+    b.createOrFold<affine::AffineDelinearizeIndexOp>(
+        delinearized, loc, threadId, basis,
+        /*hasOuterbound=*/numThreadsInWorkgroup.has_value());
+    // Get all results which are not in dimToResult and check they are 0.
+    Value condition = arith::ConstantOp::create(b, loc, b.getBoolAttr(true));
+    for (auto [idx, result] : llvm::enumerate(delinearized)) {
+      if (llvm::is_contained(dimToResult, idx)) {
+        continue;
+      }
+      Value isZero = b.createOrFold<arith::CmpIOp>(
+          loc, arith::CmpIPredicate::eq, result,
+          arith::ConstantIndexOp::create(b, loc, 0));
+      condition = b.createOrFold<arith::AndIOp>(loc, condition, isZero);
+    }
+    return condition;
+  }
 
   LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp,
                                 DistributionSignature &signature,
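
To make the new no-overlap check concrete, here is a rough, self-contained C++ model of the same idea. It is an illustrative sketch only, not IREE's actual basisFromSizesStrides implementation; the helper names and the assumed workgroup size of 256 threads are invented for the example. The numbers mirror the @partially_distributed_write test added below: subgroup_size = 64, thread_tile = [2, 8] with thread_strides = [32, 1], and subgroup_tile = [4, 1] with subgroup_strides = [1, 1] (i.e. strides [64, 64] in thread-id units). The distributed strides {1, 32, 64} leave the stride range [8, 32) uncovered, so that digit of the delinearized thread id carries duplicated data and must be zero for a thread to be a designated writer.

// Rough scalar model of the no-overlap condition (illustrative only; not the
// exact algorithm used by the pattern above).
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

struct Dim {
  int64_t size;
  int64_t stride;
  bool used; // false => this delinearized digit carries duplicated data
};

// Build a delinearization basis from the distributed (size, stride) pairs.
// Gaps between covered stride ranges become "unused" digits that must be zero
// for a thread to be the unique writer of its duplication group.
std::vector<Dim> buildBasis(std::vector<Dim> dims, int64_t numThreads) {
  dims.erase(std::remove_if(dims.begin(), dims.end(),
                            [](const Dim &d) {
                              return d.size == 1 || d.stride == 0;
                            }),
             dims.end());
  std::sort(dims.begin(), dims.end(),
            [](const Dim &a, const Dim &b) { return a.stride < b.stride; });
  std::vector<Dim> basis;
  int64_t covered = 1;
  for (const Dim &d : dims) {
    if (d.stride > covered) // broadcast gap between distributed dims
      basis.push_back({d.stride / covered, covered, /*used=*/false});
    basis.push_back({d.size, d.stride, /*used=*/true});
    covered = d.stride * d.size;
  }
  if (numThreads > covered) // threads beyond the layout, e.g. extra subgroups
    basis.push_back({numThreads / covered, covered, /*used=*/false});
  return basis;
}

// Analogue of the generated condition: every unused digit must be zero.
bool isDesignatedWriter(int64_t tid, const std::vector<Dim> &basis) {
  for (const Dim &d : basis)
    if (!d.used && (tid / d.stride) % d.size != 0)
      return false;
  return true;
}

int main() {
  constexpr int64_t kSubgroupSize = 64;
  constexpr int64_t kNumThreads = 256; // assumed workgroup size
  // subgroup_tile = [4, 1] at stride 64, thread_tile = [2, 8] at strides [32, 1].
  std::vector<Dim> dims = {{4, 1 * kSubgroupSize, false},
                           {1, 1 * kSubgroupSize, false},
                           {2, 32, false},
                           {8, 1, false}};
  std::vector<Dim> basis = buildBasis(dims, kNumThreads);
  int64_t writers = 0;
  for (int64_t tid = 0; tid < kNumThreads; ++tid)
    writers += isDesignatedWriter(tid, basis);
  // 4 (subgroups) * 2 * 8 (threads) = 64 distinct slices => 64 writers; the
  // remaining 192 threads hold duplicated data and are skipped.
  std::cout << "designated writers: " << writers << "\n";
  return 0;
}

The scf.if guard inserted into matchAndRewrite below plays the role of isDesignatedWriter in this model: only threads for which every unused digit is zero perform the transfer_write.
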
@@ -456,7 +519,6 @@ struct DistributeTransferWrite final
     SmallVector<int64_t> distShape = vectorLayout.getDistributedShape();
     SmallVector<int64_t> tileShape = getElementVectorTileShape(vectorLayout);
     int64_t rank = vectorLayout.getRank();
-
     SmallVector<Value> warpIndices, threadIndices;
     if (failed(populateWarpAndThreadIndices(rewriter, threadId, subgroupSize,
                                             vectorLayout, warpIndices,
@@ -465,6 +527,18 @@ struct DistributeTransferWrite final
           writeOp, "warp or thread tiles have overlapping strides");
     }
 
+    // If the distribution results in threads writing to the same address, guard
+    // with an scf.if to ensure only one thread writes per duplication group.
+    Location loc = writeOp.getLoc();
+    FailureOr<Value> doWrite =
+        getNoOverlapCondition(rewriter, loc, vectorLayout);
+    if (failed(doWrite)) {
+      return rewriter.notifyMatchFailure(
+          writeOp, "failed to compute no-overlap condition");
+    }
+    auto ifOp = scf::IfOp::create(rewriter, loc, doWrite.value());
+    rewriter.setInsertionPoint(ifOp.thenYield());
+
     Value distributedVector =
         getDistributed(rewriter, writeOp.getValueToStore(), vectorLayout);
 
@@ -485,7 +559,6 @@ struct DistributeTransferWrite final
       SmallVector<Value> slicedIndices = getTransferIndicesFromNestedLayout(
           rewriter, indices, offsets, vectorLayout, permMap, warpIndices,
          threadIndices);
-
      // Extract the "element vector" from the inner most dimensions. All outer
      // dimensions are either unrolled or distributed such that this is a
      // contiguous slice.
@@ -516,6 +589,7 @@ struct DistributeTransferWrite final
 
   Value threadId;
   int64_t subgroupSize;
+  std::optional<int64_t> numThreadsInWorkgroup = std::nullopt;
 };
 
 /// Pattern to distribute `vector.transfer_gather` ops with nested layouts.
@@ -2127,13 +2201,14 @@ struct DistributeConstantMask final
 
 } // namespace
 
-void populateGPUDistributeNestedLayoutAttrPatterns(RewritePatternSet &patterns,
-                                                   Value threadId,
-                                                   int64_t subgroupSize,
-                                                   int64_t maxBitsPerShuffle) {
-  patterns.add<DistributeTransferRead, DistributeTransferWrite,
-               DistributeTransferGather, DistributeMapScatter>(
-      patterns.getContext(), threadId, subgroupSize);
+void populateGPUDistributeNestedLayoutAttrPatterns(
+    RewritePatternSet &patterns, Value threadId, int64_t subgroupSize,
+    ArrayRef<int64_t> workgroupSize, int64_t maxBitsPerShuffle) {
+  patterns.add<DistributeTransferRead, DistributeTransferGather,
+               DistributeMapScatter>(patterns.getContext(), threadId,
+                                     subgroupSize);
+  patterns.add<DistributeTransferWrite>(patterns.getContext(), threadId,
+                                        subgroupSize, workgroupSize);
   patterns.add<DistributeBroadcast, DistributeTranspose>(patterns.getContext());
   patterns.add<DistributeMultiReduction>(patterns.getContext(), subgroupSize,
                                          maxBitsPerShuffle);

compiler/src/iree/compiler/Codegen/Common/GPU/GPUPatterns.h

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ void populateGPUDistributionPatterns(RewritePatternSet &patterns);
 
 void populateGPUDistributeNestedLayoutAttrPatterns(
     RewritePatternSet &patterns, Value threadId, int64_t subgroupSize,
-    int64_t maxBitsPerShuffle = 32);
+    ArrayRef<int64_t> workgroupSize, int64_t maxBitsPerShuffle = 32);
 
 // Adds patterns that distributes vector.contract ops with nested layout
 // annotations to amdgpu.mfma ops.

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir

Lines changed: 139 additions & 0 deletions
@@ -1409,3 +1409,142 @@ builtin.module attributes { transform.with_named_sequence } {
 // CHECK-DAG: %[[DISTRIBUTED_IDX1:.+]] = arith.addi %[[IDX1]], %[[C8]]
 // CHECK: iree_linalg_ext.yield %[[DISTRIBUTED_IDX0]], %[[DISTRIBUTED_IDX1]]
 // CHECK: : vector<1x8xf16> into memref<64x64xf16>
+
+// -----
+
+// Check that only the first lane of the first subgroup writes when the threads
+// are completely undistributed (all threads write to same address).
+// CHECK-LABEL: @undistributed_write
+func.func @undistributed_write(%out: memref<f32, #amdgpu.address_space<fat_raw_buffer>>, %v: vector<f32>) {
+  // CHECK-DAG: %[[ZERO:.*]] = arith.constant 0 : index
+  // CHECK-DAG: %[[TID:.*]] = gpu.thread_id x
+  // CHECK-DAG: %[[COND:.+]] = arith.cmpi eq, %[[TID]], %[[ZERO]] : index
+  // CHECK-NEXT: scf.if %[[COND]] {
+  // CHECK: vector.transfer_write
+  // CHECK-NEXT: }
+  vector.transfer_write %v, %out[] : vector<f32>, memref<f32, #amdgpu.address_space<fat_raw_buffer>>
+  return
+}
+
+builtin.module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
+    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+#layout_row_major = #iree_vector_ext.nested_layout<
+  subgroup_tile = [4, 1],
+  batch_tile = [1, 1],
+  outer_tile = [1, 1],
+  thread_tile = [2, 8],
+  element_tile = [1, 2],
+  subgroup_strides = [1, 1],
+  thread_strides = [32, 1]
+>
+
+// subgroup_size = 64 (default for the transform test_gpu_vector_distribution)
+// A possible thread basis for this distribution would be:
+// thread_basis = [2, 4, 8] and the dimension with size "4" has data broadcasted
+// across all threads (note the thread strides). This test checks if we account
+// for such broadcasts when generating conditional writes.
+// CHECK-LABEL: @partially_distributed_write
+// CHECK-DAG: %[[TID:.+]] = gpu.thread_id x
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK: %[[DELIN:.*]]:5 = affine.delinearize_index %[[TID:.+]] into (4, 2, 4, 8)
+// CHECK-DAG: %[[SUBGROUP_COND:.+]] = arith.cmpi eq, %[[DELIN]]#0, %[[C0]] : index
+// CHECK-DAG: %[[LANE_COND:.+]] = arith.cmpi eq, %[[DELIN]]#3, %[[C0]] : index
+// CHECK: %[[COND:.+]] = arith.andi %[[SUBGROUP_COND]], %[[LANE_COND]]
+// CHECK: scf.if %[[COND]] {
+// CHECK: vector.transfer_write
+func.func @partially_distributed_write(%out: memref<100x100xf32, #amdgpu.address_space<fat_raw_buffer>>, %v: vector<8x16xf32>) {
+  %w = iree_vector_ext.to_layout %v to layout(#layout_row_major) : vector<8x16xf32>
+  %c0 = arith.constant 0 : index
+  vector.transfer_write %w, %out[%c0, %c0]
+    {in_bounds = [true, true]}
+    : vector<8x16xf32>, memref<100x100xf32, #amdgpu.address_space<fat_raw_buffer>>
+  func.return
+}
+
+builtin.module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
+    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// In this example, threads with the same lane write to the same address. We check that only the first subgroup writes.
+// i.e. threads in [0, 64) will write, threads in [64, 256) will not write.
+#layout_row_major = #iree_vector_ext.nested_layout<
+  subgroup_tile = [1, 1],
+  batch_tile = [1, 1],
+  outer_tile = [1, 1],
+  thread_tile = [1, 64],
+  element_tile = [64, 1],
+  subgroup_strides = [1, 1],
+  thread_strides = [1, 1]
+>
+
+// CHECK-LABEL: @lanes_fully_distributed
+// CHECK-DAG: %[[TID:.+]] = gpu.thread_id
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK: %[[DELIN:.*]]:2 = affine.delinearize_index %[[TID:.+]] into (4, 64)
+// CHECK: %[[COND:.+]] = arith.cmpi eq, %[[DELIN]]#0, %[[C0]] : index
+// CHECK: scf.if %[[COND]] {
+// CHECK: vector.transfer_write
+func.func @lanes_fully_distributed(%out: memref<100x100xf32, #amdgpu.address_space<fat_raw_buffer>>, %v: vector<64x64xf32>) {
+  %w = iree_vector_ext.to_layout %v to layout(#layout_row_major) : vector<64x64xf32>
+  %c0 = arith.constant 0 : index
+  vector.transfer_write %w, %out[%c0, %c0]
+    {in_bounds = [true, true]}
+    : vector<64x64xf32>, memref<100x100xf32, #amdgpu.address_space<fat_raw_buffer>>
+  func.return
+}
+
+builtin.module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
+    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.iree.test_gpu_vector_distribution %top_level_func {workgroup_size = array<i64: 256, 1, 1>} : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// This example is similar to the above, but now the workgroup only contains 64 threads, so no condition is needed. Confirm there is no condition.
+#layout_row_major = #iree_vector_ext.nested_layout<
+  subgroup_tile = [1, 1],
+  batch_tile = [1, 1],
+  outer_tile = [1, 1],
+  thread_tile = [1, 64],
+  element_tile = [64, 1],
+  subgroup_strides = [1, 1],
+  thread_strides = [1, 1]
+>
+
+// CHECK-LABEL: @threads_fully_distributed
+// CHECK-NOT: scf.if
+// CHECK: transfer_write
+// CHECK: return
+func.func @threads_fully_distributed(%out: memref<100x100xf32, #amdgpu.address_space<fat_raw_buffer>>, %v: vector<64x64xf32>) {
+  %w = iree_vector_ext.to_layout %v to layout(#layout_row_major) : vector<64x64xf32>
+  %c0 = arith.constant 0 : index
+  vector.transfer_write %w, %out[%c0, %c0]
+    {in_bounds = [true, true]}
+    : vector<64x64xf32>, memref<100x100xf32, #amdgpu.address_space<fat_raw_buffer>>
+  func.return
+}
+
+builtin.module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
+    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.iree.test_gpu_vector_distribution %top_level_func {workgroup_size = array<i64: 64, 1, 1>} : !transform.any_op
+    transform.yield
+  }
+}

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_mask.mlir

Lines changed: 7 additions & 7 deletions
@@ -2,9 +2,9 @@
 
 #nested = #iree_vector_ext.nested_layout<
   subgroup_tile = [2, 1],
-  batch_tile = [2, 1],
+  batch_tile = [8, 1],
   outer_tile = [2, 1],
-  thread_tile = [16, 16],
+  thread_tile = [4, 16],
   element_tile = [2, 8],
 
   subgroup_strides = [1, 0],
@@ -34,13 +34,13 @@ builtin.module attributes { transform.with_named_sequence } {
 // CHECK-LABEL: func @masked_read_write
 // CHECK: %[[DIM:.+]] = memref.dim %arg0, %c0 : memref<?x128xf16>
 // CHECK: %[[VSID:.+]]:3 = affine.delinearize_index %thread_id_x into (2, 64) : index, index, index
-// CHECK: %[[VTID:.+]]:3 = affine.delinearize_index %thread_id_x into (16, 16) : index, index, index
+// CHECK: %[[VTID:.+]]:3 = affine.delinearize_index %thread_id_x into (4, 16) : index, index, index
 // CHECK: %[[LASTIDX:.+]] = arith.subi %[[DIM]], %c1 : index
-// CHECK: %[[PACKED_LASTIDX:.+]]:4 = affine.delinearize_index %[[LASTIDX]] into (2, 4, 16, 2) : index, index, index, index
+// CHECK: %[[PACKED_LASTIDX:.+]]:4 = affine.delinearize_index %[[LASTIDX]] into (2, 16, 4, 2) : index, index, index, index
 
-// CHECK: %[[ETILE_VALID:.+]] = affine.linearize_index [%[[PACKED_LASTIDX]]#1, %c1] by (4, 2) : index
+// CHECK: %[[ETILE_VALID:.+]] = affine.linearize_index [%[[PACKED_LASTIDX]]#1, %c1] by (16, 2) : index
 // CHECK: %[[ETILE_VALID_BOUND:.+]] = arith.addi %[[ETILE_VALID]], %c1 : index
-// CHECK: %[[DISTR_LASTIDX:.+]] = affine.linearize_index [%[[PACKED_LASTIDX]]#1, %[[PACKED_LASTIDX]]#3] by (4, 2) : index
+// CHECK: %[[DISTR_LASTIDX:.+]] = affine.linearize_index [%[[PACKED_LASTIDX]]#1, %[[PACKED_LASTIDX]]#3] by (16, 2) : index
 // CHECK: %[[DISTR_BOUND:.+]] = arith.addi %[[DISTR_LASTIDX]], %c1 : index
 
 // CHECK: %[[EQ_BOUND_TID:.+]] = arith.cmpi eq, %[[VTID]]#1, %[[PACKED_LASTIDX]]#2 : index
@@ -50,7 +50,7 @@ builtin.module attributes { transform.with_named_sequence } {
 
 // CHECK: %[[SELTREE0:.+]] = arith.select %[[LT_BOUND_TID]], %[[ETILE_VALID_BOUND]], %c0 : index
 // CHECK: %[[SELTREE1:.+]] = arith.select %[[EQ_BOUND_TID]], %[[DISTR_BOUND]], %[[SELTREE0]] : index
-// CHECK: %[[SELTREE2:.+]] = arith.select %[[LT_BOUND_SID]], %c8, %c0 : index
+// CHECK: %[[SELTREE2:.+]] = arith.select %[[LT_BOUND_SID]], %c32, %c0 : index
 // CHECK: %[[SELTREE3:.+]] = arith.select %[[EQ_BOUND_SID]], %[[SELTREE1]], %[[SELTREE2]] : index
 // CHECK: %[[MASK:.+]] = vector.create_mask %[[SELTREE3]], %c8 : vector<2x8xi1>
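
A quick arithmetic check of the updated FileCheck constants, assuming the usual nested-layout convention that the per-thread slice along a dimension spans batch_tile * outer_tile * element_tile elements:
  old dim-0 slice per thread: 2 * 2 * 2 = 8   -> old %c8 and linearize basis (4, 2)
  new dim-0 slice per thread: 8 * 2 * 2 = 32  -> new %c32 and linearize basis (16, 2)
  total dim-0 size unchanged: 2*2*2*16*2 = 2*8*2*4*2 = 256
The VTID delinearization basis simply follows the thread_tile change from (16, 16) to (4, 16).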