Commit 9cefe6f

address comments
1 parent 32f8c79 commit 9cefe6f

3 files changed: +34 −34 lines changed

mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp

Lines changed: 28 additions & 27 deletions
@@ -444,9 +444,8 @@ void LayoutInfoPropagation::visitStoreNdOp(
     ArrayRef<const LayoutInfoLattice *> results) {
   LayoutInfo storeLayout = getDefaultSIMTLayoutInfo(store.getValueType());
   // Both operands should have the same layout
-  for (LayoutInfoLattice *operand : operands) {
+  for (LayoutInfoLattice *operand : operands)
     propagateIfChanged(operand, operand->meet(storeLayout));
-  }
 }
 
 /// Propagate the layout of the value to the tensor descriptor operand in
@@ -659,20 +658,18 @@ RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
 
   SmallVector<FunctionOpInterface> funcOps;
   if (auto modOp = dyn_cast<ModuleOp>(target)) {
-    for (auto funcOp : modOp.getOps<FunctionOpInterface>()) {
+    for (auto funcOp : modOp.getOps<FunctionOpInterface>())
       funcOps.push_back(funcOp);
-    }
+
     // Collect all GpuFuncOps in the module.
     for (auto gpuModOp : modOp.getOps<gpu::GPUModuleOp>()) {
-      for (auto gpuFuncOp : gpuModOp.getOps<FunctionOpInterface>()) {
+      for (auto gpuFuncOp : gpuModOp.getOps<FunctionOpInterface>())
         funcOps.push_back(gpuFuncOp);
-      }
     }
   }
   // Print the analysis result for each function.
-  for (FunctionOpInterface funcOp : funcOps) {
+  for (FunctionOpInterface funcOp : funcOps)
     printFunctionResult(funcOp);
-  }
 }
 
 using GetLayoutFnTy = function_ref<xegpu::LayoutAttr(Value)>;
@@ -706,7 +703,6 @@ static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
     }
     // If the result is a vector type, add a temporary layout attribute to the
    // op.
-    std::string resultLayoutName = xegpu::getLayoutName(result);
    xegpu::setLayoutAttr(result, layout);
  }
 }
@@ -717,6 +713,7 @@ static void updateBranchTerminatorOpInterface(
     mlir::OpBuilder &builder,
     mlir::RegionBranchTerminatorOpInterface terminator,
     GetLayoutFnTy getLayoutOfValue) {
+  // Only process if the terminator is inside a region branch op.
   if (!mlir::isa<mlir::RegionBranchOpInterface>(terminator->getParentOp()))
     return;
 
@@ -729,9 +726,10 @@ static void updateBranchTerminatorOpInterface(
     if (!successor.isParent())
       continue;
 
-    mlir::OperandRange operands = terminator.getSuccessorOperands(successor);
-    mlir::ValueRange inputs = successor.getSuccessorInputs();
-    for (auto [operand, input] : llvm::zip(operands, inputs)) {
+    mlir::OperandRange forwardedOperands =
+        terminator.getSuccessorOperands(successor);
+    mlir::ValueRange regionArgs = successor.getSuccessorInputs();
+    for (auto [operand, input] : llvm::zip(forwardedOperands, regionArgs)) {
       // print arg and inp
       // llvm::errs() << "arg: " << operand << ", inp: " << input << "\n";
       Type inputType = input.getType();
@@ -773,38 +771,43 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder,
   llvm::SmallVector<mlir::RegionSuccessor> successors;
   llvm::SmallVector<mlir::Attribute> operands(op->getNumOperands(), nullptr);
   branch.getEntrySuccessorRegions(operands, successors);
-  DenseMap<Value, xegpu::LayoutAttr> resultToLayouts;
+  DenseMap<Value, xegpu::LayoutAttr>
+      resultToLayouts; // This map keeps track of layouts of any unused results
+                       // of the branch op.
   mlir::ValueRange results = op->getResults();
 
   for (mlir::RegionSuccessor &successor : successors) {
+    // Only interested in successor regions that are contained within the op.
     if (successor.isParent())
       continue;
 
-    mlir::OperandRange operands = branch.getEntrySuccessorOperands(successor);
-    mlir::ValueRange inputs = successor.getSuccessorInputs();
+    mlir::OperandRange forwardedOperands =
+        branch.getEntrySuccessorOperands(successor);
+    mlir::ValueRange regionArgs = successor.getSuccessorInputs();
 
-    for (auto [operand, input, result] : llvm::zip(operands, inputs, results)) {
-      Type inputType = input.getType();
+    for (auto [forwardedOperand, regionArg, result] :
+         llvm::zip(forwardedOperands, regionArgs, results)) {
+      Type inputType = regionArg.getType();
       if (!isa<xegpu::TensorDescType>(inputType))
         continue;
-      xegpu::LayoutAttr inputLayout = getLayoutOfValue(input);
-      xegpu::LayoutAttr operandLayout = getLayoutOfValue(operand);
+      xegpu::LayoutAttr inputLayout = getLayoutOfValue(regionArg);
+      xegpu::LayoutAttr operandLayout = getLayoutOfValue(forwardedOperand);
 
       if (!inputLayout || !operandLayout) {
-        LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << input
-                          << " or init arg: " << operand << "\n");
+        LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << regionArg
+                          << " or init arg: " << forwardedOperand << "\n");
         continue;
       }
 
       // TODO: We expect these two to match.
       assert(inputLayout == operandLayout &&
-             "Expexing block arg and init arg to have the same layout.");
+             "Expecting block arg and init arg to have the same layout.");
       // Get tensor descriptor type with the layout.
       auto tdescTy = dyn_cast<xegpu::TensorDescType>(inputType);
       auto newTdescTy = xegpu::TensorDescType::get(
           tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(),
           tdescTy.getEncoding(), inputLayout);
-      input.setType(newTdescTy);
+      regionArg.setType(newTdescTy);
       // Store the layout for the result.
       if (resultToLayouts.count(result) != 0 &&
           resultToLayouts[result] != inputLayout) {
@@ -837,7 +840,6 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder,
     }
     // If the result is a vector type, add a temporary layout attribute to
     // the op.
-    std::string resultLayoutName = xegpu::getLayoutName(r);
     xegpu::setLayoutAttr(r, layout);
   }
 }
@@ -865,7 +867,6 @@ static void updateFunctionOpInterface(mlir::OpBuilder &builder,
           tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
       arg.setType(newTdescTy);
       newArgTypes.back() = newTdescTy;
-      continue;
     }
   }
   // Update the function type with the new argument types.
@@ -887,9 +888,9 @@ void XeGPULayoutPropagatePass::runOnOperation() {
   // Helper to convert LayoutInfo to xegpu::LayoutAttr.
   auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr {
     LayoutInfo layout = analyis.getLayoutInfo(val);
-    if (!layout.isAssigned()) {
+    if (!layout.isAssigned())
      return {};
-    }
+
     SmallVector<int, 2> laneLayout, laneData;
     for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
                                                layout.getDataAsArrayRef())) {
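
For context, the renames in updateBranchOpInterface follow the standard MLIR RegionBranchOpInterface traversal. A minimal sketch of that pattern, assuming a standalone helper (the function name and the unused-variable casts are illustrative; only interface calls that already appear in the diff are used):

#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Sketch only: walk the entry successor regions of a region-branch op (e.g.
// scf.for) and pair each forwarded operand (init arg) with the region
// argument it feeds; layout propagation expects the two to carry the same
// layout.
static void walkEntrySuccessors(mlir::RegionBranchOpInterface branch) {
  llvm::SmallVector<mlir::RegionSuccessor> successors;
  llvm::SmallVector<mlir::Attribute> operands(branch->getNumOperands(), nullptr);
  branch.getEntrySuccessorRegions(operands, successors);
  for (mlir::RegionSuccessor &successor : successors) {
    // Skip the parent successor; only regions nested inside the op matter here.
    if (successor.isParent())
      continue;
    mlir::OperandRange forwardedOperands =
        branch.getEntrySuccessorOperands(successor);
    mlir::ValueRange regionArgs = successor.getSuccessorInputs();
    for (auto [forwardedOperand, regionArg] :
         llvm::zip(forwardedOperands, regionArgs)) {
      // forwardedOperand: the value passed into the region (e.g. an init arg).
      // regionArg: the block argument inside the region that receives it.
      (void)forwardedOperand;
      (void)regionArg;
    }
  }
}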

mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp

Lines changed: 3 additions & 4 deletions
@@ -97,9 +97,9 @@ getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
   // dimensions are not distributed.
   unsigned distributionStart = originalType.getRank() - laneLayout.size();
   for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
-    if (i < distributionStart) {
+    if (i < distributionStart)
       continue;
-    }
+
     // Check if the dimension can be distributed evenly.
     if (dim % laneLayout[i - distributionStart] != 0)
       return failure();
@@ -848,9 +848,8 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     // GPU index ops, scalar constants, etc.). This will simplify the
     // later lowering and avoid custom patterns for these ops.
     getOperation()->walk([&](Operation *op) {
-      if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op)) {
+      if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
        vector::moveScalarUniformCode(warpOp);
-      }
     });
   }
   // Step 3: Apply subgroup to workitem distribution patterns.

mlir/test/Dialect/XeGPU/subgroup-distribute.mlir

Lines changed: 3 additions & 3 deletions
@@ -166,8 +166,8 @@ gpu.module @test {
 }
 
 // -----
-// TODO: gemm does not use update_nd_offset because of an issue in vector distribution. PR141853 tracks this issue.
-// CHECK-LABEL: gpu.func @gemm_loop
+// TODO: gemm does not use update_nd_offset because of an issue in scf-for distribution.
+// CHECK-LABEL: gpu.func @gemm
 // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
 // CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x
 // CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y
@@ -189,7 +189,7 @@ gpu.module @test {
 // CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
 // CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
 gpu.module @test {
-  gpu.func @gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
+  gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
    %c0 = arith.constant 0 : index
    %c16 = arith.constant 16 : index
    %c8 = arith.constant 8 : index
