Skip to content

Commit ba6721e

Browse files
authored
improve reduction logic (#840)
1 parent 4dcf6ee commit ba6721e

File tree

2 files changed

+76
-11
lines changed

2 files changed

+76
-11
lines changed

lib/Conversion/XeTileToXeGPU/ArithOpConversion.cpp

Lines changed: 35 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -169,7 +169,7 @@ llvm::SmallVector<mlir::Value> lowerInnerReductionWithIntraVectorShuffles(
169169

170170
// Stage 1: vector<ixjx1xnxf16> equals to a grid of ixj of vector<1xnxf16>
171171
// after lowering to xegpu. This stage performs j-1 reduction operations on
172-
// j dim of the grid, the result is a vector of vector<mxnxf16> with size i.
172+
// j dim of the grid, the result is a vector of vector<ixnxf16>.
173173
llvm::SmallVector<mlir::Value> intermediates(shape[0]);
174174
for (auto i = 0; i < shape[0]; i++) {
175175
auto combiningVal = sources[i * shape[1]];
@@ -191,14 +191,13 @@ llvm::SmallVector<mlir::Value> lowerInnerReductionWithIntraVectorShuffles(
191191
// v2 = [b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ... b31]
192192
// ...
193193
// vn = [p0 p1 p2 p3 p4 p5 p6 p7 p8 p9 ... p31]
194-
// it will repeately doing shuffle between two consecutive vectors
195-
// v1 and v2, v3 and v4, ..., vn-1 and vn with a block size. Such
196-
// that we can get two new vectors. The block size is typically
197-
// starts with half of the vector size. For example, for v1 and v2,
198-
// it is 16, and we can get:
194+
// To reduce it, we repeatedly shuffle halves of two consecutive vectors.
195+
// One can view it as: transpose halves of two partial aggregates, reduce
196+
// vertically, get 1 vector with reduced halves of two vectors. For example,
197+
// for v1 and v2, we get:
199198
// nv1 = [a0, .., a15, b0, .., b15]
200199
// nv2 = [a16, .., a31, b16, .., b31]
201-
// and we then performs nv1 + nv2 (if reduction op is add)
200+
// nv_reduced = reductionOp(nv1,nv2)
202201
// such that the left half of the vector contains the partial reduction
203202
// of v1, and the right half contains the partial reduction of v2.
204203
// and the number of vectors is reduced by half after one iteration.
@@ -207,12 +206,16 @@ llvm::SmallVector<mlir::Value> lowerInnerReductionWithIntraVectorShuffles(
207206
// The intermediate result of this stage is an array of vectors with
208207
// type, e.g., vector<nxf16>, array size is `i/n`. And these vectors
209208
// will be merged into a single vector with type vector<ixf16>.
210-
auto blkSize = shape[3] / 2;
211-
while (blkSize) {
209+
210+
// each row should not have > 1 partial aggregate at the end
211+
auto partialRowAggSize{shape[3]};
212+
auto numVecsLeft{shape[0]};
213+
while (partialRowAggSize != 1 && numVecsLeft != 1) {
214+
partialRowAggSize /= 2;
212215
auto workList = intermediates;
213216
intermediates.clear();
214217
assert(workList.size() % 2 == 0 && "The size should be divisible by 2.");
215-
auto masks = genShuffleMasks(blkSize, shape[3]);
218+
auto masks = genShuffleMasks(partialRowAggSize, shape[3]);
216219
for (size_t i = 0; i < workList.size(); i += 2) {
217220
auto v1 = workList[i];
218221
auto v2 = workList[i + 1];
@@ -224,7 +227,28 @@ llvm::SmallVector<mlir::Value> lowerInnerReductionWithIntraVectorShuffles(
224227
createBinOp(kind, shuffleOp1, shuffleOp2, elemTy, loc, rewriter);
225228
intermediates.push_back(reductionVal);
226229
}
227-
blkSize /= 2;
230+
numVecsLeft /= 2;
231+
}
232+
233+
if (partialRowAggSize > 1) {
234+
assert(intermediates.size() == 1 &&
235+
"We must have ONE row with non-finalized aggregates.");
236+
auto toFinalize = intermediates.back();
237+
intermediates.clear();
238+
uint32_t currentAggVecSize = shape[3];
239+
do {
240+
currentAggVecSize /= 2;
241+
partialRowAggSize /= 2;
242+
auto [vecUpperMask, vecLowerMask] =
243+
genShuffleMasks(partialRowAggSize, currentAggVecSize);
244+
auto shuffleOp1 = rewriter.create<mlir::vector::ShuffleOp>(
245+
loc, toFinalize, toFinalize, vecUpperMask);
246+
auto shuffleOp2 = rewriter.create<mlir::vector::ShuffleOp>(
247+
loc, toFinalize, toFinalize, vecLowerMask);
248+
toFinalize =
249+
createBinOp(kind, shuffleOp1, shuffleOp2, elemTy, loc, rewriter);
250+
} while (partialRowAggSize != 1);
251+
intermediates.push_back(toFinalize);
228252
}
229253
return intermediates;
230254
}

test/Conversion/XeTileToXeGPU/reduction.mlir

Lines changed: 41 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -186,6 +186,47 @@ module {
186186
gpu.return
187187
}
188188

189+
gpu.func @inner_reduction_1(%a: memref<8x32xf32>, %b: memref<8x1xf32>) {
190+
%c0 = arith.constant 0 : index
191+
%neg_inf = arith.constant dense<0xFF800000> : vector<8xf32> // -inf
192+
193+
%a_tile = xetile.init_tile %a[%c0, %c0] : memref<8x32xf32> -> !xetile.tile<8x32xf32>
194+
%b_tile = xetile.init_tile %b[%c0, %c0] : memref<8x1xf32> -> !xetile.tile<8x1xf32>
195+
196+
//CHECK: xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true>> -> vector<8x16xf32>
197+
//CHECK: xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_scope = global, array_length = 1 : i64, boundary_check = true>> -> vector<8x16xf32>
198+
%a_loaded = xetile.load_tile %a_tile: !xetile.tile<8x32xf32> -> vector<8x32xf32>
199+
200+
//CHECK: %[[R1:.*]] = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] : vector<16xf32>, vector<16xf32>
201+
//CHECK: %[[R2:.*]] = vector.shuffle %{{.*}}, %{{.*}} [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16xf32>, vector<16xf32>
202+
//CHECK: %[[R3:.*]] = arith.maximumf %[[R1]], %[[R2]] : vector<16xf32>
203+
//CHECK: %[[R4:.*]] = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] : vector<16xf32>, vector<16xf32>
204+
//CHECK: %[[R5:.*]] = vector.shuffle %{{.*}}, %{{.*}} [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16xf32>, vector<16xf32>
205+
//CHECK: %[[R6:.*]] = arith.maximumf %[[R4]], %[[R5]] : vector<16xf32>
206+
//CHECK: %[[R7:.*]] = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] : vector<16xf32>, vector<16xf32>
207+
//CHECK: %[[R8:.*]] = vector.shuffle %{{.*}}, %{{.*}} [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16xf32>, vector<16xf32>
208+
//CHECK: %[[R9:.*]] = arith.maximumf %[[R7]], %[[R8]] : vector<16xf32>
209+
//CHECK: %[[R10:.*]] = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] : vector<16xf32>, vector<16xf32>
210+
//CHECK: %[[R11:.*]] = vector.shuffle %{{.*}}, %{{.*}} [8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16xf32>, vector<16xf32>
211+
//CHECK: %[[R12:.*]] = arith.maximumf %[[R10]], %[[R11]] : vector<16xf32>
212+
//CHECK: %[[R13:.*]] = vector.shuffle %[[R3]], %[[R6]] [0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27] : vector<16xf32>, vector<16xf32>
213+
//CHECK: %[[R14:.*]] = vector.shuffle %[[R3]], %[[R6]] [4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31] : vector<16xf32>, vector<16xf32>
214+
//CHECK: %[[R15:.*]] = arith.maximumf %[[R13]], %[[R14]] : vector<16xf32>
215+
//CHECK: %[[R16:.*]] = vector.shuffle %[[R9]], %[[R12]] [0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27] : vector<16xf32>, vector<16xf32>
216+
//CHECK: %[[R17:.*]] = vector.shuffle %[[R9]], %[[R12]] [4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31] : vector<16xf32>, vector<16xf32>
217+
//CHECK: %[[R18:.*]] = arith.maximumf %[[R16]], %[[R17]] : vector<16xf32>
218+
//CHECK: %[[R19:.*]] = vector.shuffle %[[R15]], %[[R18]] [0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29] : vector<16xf32>, vector<16xf32>
219+
//CHECK: %[[R20:.*]] = vector.shuffle %[[R15]], %[[R18]] [2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31] : vector<16xf32>, vector<16xf32>
220+
//CHECK: %[[R21:.*]] = arith.maximumf %[[R19]], %[[R20]] : vector<16xf32>
221+
//CHECK: %[[R22:.*]] = vector.shuffle %[[R21]], %[[R21]] [0, 2, 4, 6, 8, 10, 12, 14] : vector<16xf32>, vector<16xf32>
222+
//CHECK: %[[R23:.*]] = vector.shuffle %[[R21]], %[[R21]] [1, 3, 5, 7, 9, 11, 13, 15] : vector<16xf32>, vector<16xf32>
223+
//CHECK: %[[R24:.*]] = arith.maximumf %[[R22]], %[[R23]] : vector<8xf32>
224+
%3 = vector.multi_reduction <maximumf>, %a_loaded, %neg_inf [1] : vector<8x32xf32> to vector<8xf32> // fastmath<nnan> is implicit here
225+
%reduced = vector.shape_cast %3 : vector<8xf32> to vector<8x1xf32>
226+
xetile.store_tile %reduced, %b_tile : vector<8x1xf32>, !xetile.tile<8x1xf32>
227+
gpu.return
228+
}
229+
189230
//CHECK: gpu.func @outter_reduction(%[[arg0:.*]]: memref<128x256xf16>, %[[arg1:.*]]: memref<128x256xf16>) {
190231
gpu.func @outter_reduction(%a: memref<128x256xf16>, %b: memref<128x256xf16>) {
191232
//CHECK: %[[cst:.*]] = arith.constant dense<0.000000e+00> : vector<32xf16>

0 commit comments

Comments
 (0)