Skip to content

Commit 21c232a

Browse files
[LoadStoreOpToLLVM] Fix load with base height == 1 (#4602)
When `strides[0]` is 0, we only want to load the first row, so we set the base height to be 1. (<= done in another PR) When base height is less than tile height and base height is 1, only the first row contain valid data. To ensure the entire tile is filled with valid data, we must replicate the first row throughout the tile. --------- Signed-off-by: Whitney Tsang <[email protected]>
1 parent da87f29 commit 21c232a

File tree

2 files changed

+89
-1
lines changed

2 files changed

+89
-1
lines changed

test/TritonIntelGPU/tensor-pointer-load-block-2d.mlir

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,3 +246,48 @@ module attributes {ttig.support_sg_2d_block, "ttg.num-warps" = 8 : i32} {
246246
tt.return
247247
}
248248
}
249+
250+
// -----
251+
252+
// COM: Check codegen when base height is 1 and tile height is > 1.
253+
#mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [4, 2], repCluster = [2, 1], A = [16, 8], B = [8, 16], C = [16, 16]}>
254+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32, ttig.support_sg_2d_block} {
255+
// CHECK-LABEL: @baseheight1
256+
tt.func public @baseheight1(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
257+
%18 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>}>>
258+
%19 = tt.expand_dims %18 {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>}>> -> tensor<1x32xi32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
259+
%20 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1x32x!tt.ptr<f32>, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
260+
%21 = tt.addptr %20, %19 : tensor<1x32x!tt.ptr<f32>, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>, tensor<1x32xi32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
261+
%22 = tt.broadcast %21 : tensor<1x32x!tt.ptr<f32>, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> -> tensor<64x32x!tt.ptr<f32>, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
262+
%50 = tt.load %22 {ttig.block_io = "row_major"} : tensor<64x32x!tt.ptr<f32>, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
263+
// CHECK: [[C1:%.*]] = llvm.mlir.constant(1 : i32) : i32
264+
// CHECK: [[LOAD:%.*]] = triton_gen.2Dblockload %{{.*}}, %{{.*}}, [[C1]], %{{.*}}, %{{.*}}, %{{.*}} {elem_size_in_bits = 32, tile_width = 8, tile_height = 16, v_blocks = 2
265+
266+
// CHECK: [[VEC:%.*]] = llvm.mlir.undef : vector<2xi32>
267+
268+
// CHECK: [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
269+
// CHECK: [[OLDVAL:%.*]] = llvm.extractelement [[LOAD]][[[C0]] : i32] : vector<16xi32>
270+
// CHECK: [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
271+
// CHECK: [[THREADID_i64:%.*]] = llvm.call spir_funccc @_Z12get_local_idj([[C0]])
272+
// CHECK: [[THREADID:%.*]] = llvm.trunc [[THREADID_i64]] : i64 to i32
273+
// CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : i32) : i32
274+
// CHECK: [[REM:%.*]] = llvm.urem [[THREADID]], [[C8]] : i32
275+
// CHECK: [[NEWVAL:%.*]] = llvm.call spir_funccc @_Z17sub_group_shuffleij([[OLDVAL]], [[REM]])
276+
// CHECK: [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
277+
// CHECK: [[VEC1:%.*]] = llvm.insertelement [[NEWVAL]], [[VEC]][[[C0]] : i32] : vector<2xi32>
278+
279+
// CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : i32) : i32
280+
// CHECK: [[OLDVAL:%.*]] = llvm.extractelement [[LOAD]][[[C8]] : i32] : vector<16xi32>
281+
// CHECK: [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32
282+
// CHECK: [[THREADID_i64:%.*]] = llvm.call spir_funccc @_Z12get_local_idj([[C0]])
283+
// CHECK: [[THREADID:%.*]] = llvm.trunc [[THREADID_i64]] : i64 to i32
284+
// CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : i32) : i32
285+
// CHECK: [[REM:%.*]] = llvm.urem [[THREADID]], [[C8]] : i32
286+
// CHECK: [[NEWVAL:%.*]] = llvm.call spir_funccc @_Z17sub_group_shuffleij([[OLDVAL]], [[REM]])
287+
// CHECK: [[C1:%.*]] = llvm.mlir.constant(1 : i32) : i32
288+
// CHECK: [[VEC2:%.*]] = llvm.insertelement [[NEWVAL]], [[VEC1]][[[C1]] : i32] : vector<2xi32>
289+
290+
// CHECK: llvm.shufflevector [[VEC2]], [[VEC2]] [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
291+
tt.return
292+
}
293+
}

third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1283,7 +1283,8 @@ struct LoadOpToBlockIOConversion
12831283

12841284
// If the stride is 0, we want to load only the first row.
12851285
int stride = getStride(ptr, 0);
1286-
Value baseHeight = b.i32_val(stride == 0 ? 1 : tileHeight);
1286+
unsigned baseHeightInt = (stride == 0 ? 1 : tileHeight);
1287+
Value baseHeight = b.i32_val(baseHeightInt);
12871288

12881289
StringAttr kRegister = str_attr("register");
12891290
StringAttr kLane = str_attr("lane");
@@ -1380,6 +1381,48 @@ struct LoadOpToBlockIOConversion
13801381
(usePackedType && opIdx == DpasEncodingAttr::OpIdx::OperandB &&
13811382
!isTransposeRequired && originalElemBits != 32));
13821383

1384+
// When strides[0] is 0, we only want to load the first row, so we
1385+
// set the base height to be 1. If tile height is bigger than 1,
1386+
// then only the first row contains valid data. To ensure the entire
1387+
// tile is filled with valid data, we must replicate the first row
1388+
// throughout the tile.
1389+
if (baseHeightInt < tileHeight && baseHeightInt == 1) {
1390+
unsigned numIndicesPerMatrix = numValuesPerLoad / vBlocks;
1391+
SmallVector<int32_t> shuffleIndices(numValuesPerLoad);
1392+
1393+
// Create a vector to store the data of the first index of each
1394+
// matrix.
1395+
VectorType vecTy = vec_ty(loadResultElemType, vBlocks);
1396+
Value firstIndexVec = b.undef(vecTy);
1397+
1398+
for (unsigned valueIndex = 0; valueIndex < numValuesPerLoad;
1399+
++valueIndex) {
1400+
unsigned firstIndexVecIdx = valueIndex / numIndicesPerMatrix;
1401+
// Handle case where an index spans two rows.
1402+
if (valueIndex % numIndicesPerMatrix == 0) {
1403+
Value oldVal = b.extract_element(ret, b.i32_val(valueIndex));
1404+
Value newVal = oldVal;
1405+
if (tileWidth < threadsPerWarp) {
1406+
assert(tileWidth * 2 == threadsPerWarp &&
1407+
"Expecting threadsPerWarp to be 2x tileWidth");
1408+
Value threadId = getThreadId(rewriter, loc);
1409+
newVal = targetInfo.shuffleIdx(
1410+
rewriter, loc, oldVal,
1411+
b.urem(threadId, b.i32_val(tileWidth)));
1412+
}
1413+
firstIndexVec =
1414+
b.insert_element(firstIndexVec.getType(), firstIndexVec,
1415+
newVal, b.i32_val(firstIndexVecIdx));
1416+
}
1417+
1418+
shuffleIndices[valueIndex] = firstIndexVecIdx;
1419+
}
1420+
DenseI32ArrayAttr attr =
1421+
rewriter.getDenseI32ArrayAttr(shuffleIndices);
1422+
ret = rewriter.create<LLVM::ShuffleVectorOp>(
1423+
loc, load2DGenXType, firstIndexVec, firstIndexVec, attr);
1424+
}
1425+
13831426
if (others.size()) {
13841427
assert(masks.size() == others.size() &&
13851428
"The mask value has to be provided when "

0 commit comments

Comments
 (0)