Skip to content

Commit e71689d

Browse files
authored
Properly handle while loop args in OptimizeDescriptorEncoding (#7297)
Fixes the handling of while loop args so that the condition op / yield op in the before body and the arguments of the after body are both handled properly. I added a somewhat contrived lit test that reproduces the problem — the second gather plus the local load is necessary to force a specific layout propagation through the while loop.
1 parent 819cb78 commit e71689d

File tree

2 files changed

+43
-3
lines changed

2 files changed

+43
-3
lines changed

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1596,10 +1596,10 @@ SmallVector<Value> getTiedArgs(Operation *op, int resultIdx) {
15961596
} else if (auto whileOp = dyn_cast<scf::WhileOp>(op)) {
15971597
auto iterArg = whileOp.getBeforeArguments()[resultIdx];
15981598
auto result = whileOp.getResults()[resultIdx];
1599-
auto yieldVal =
1600-
whileOp.getBeforeBody()->getTerminator()->getOperand(resultIdx);
1599+
auto yieldVal = whileOp.getConditionOp().getArgs()[resultIdx];
16011600
auto initVal = whileOp.getOperands()[resultIdx];
1602-
return {iterArg, result, iterArg, initVal};
1601+
auto bodyArg = whileOp.getAfterArguments()[resultIdx];
1602+
return {iterArg, result, yieldVal, initVal, bodyArg};
16031603
} else if (auto ifOp = dyn_cast<scf::IfOp>(op)) {
16041604
SmallVector<Value> values;
16051605
for (auto &block : ifOp.getThenRegion().getBlocks()) {

test/TritonNvidiaGPU/optimize_descriptor_encoding.mlir

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,3 +83,43 @@ tt.func public @descriptor_kernel_arg(%arg0: !tt.tensordesc<tensor<64x64xf16>>,
8383
tt.return
8484
}
8585
}
86+
87+
// -----
88+
89+
90+
#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
91+
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
92+
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 8}>
93+
#smem = #ttg.shared_memory
94+
95+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100"} {
96+
// CHECK-DAG: #[[BLOCKED:.*]] = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
97+
// CHECK-DAG: #[[NVMMA_32:.*]] = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 8}>
98+
tt.func public @tma_load_while(%arg0: !tt.ptr<i8> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}, %arg3: tensor<32xi32, #blocked>, %cond: i1) {
99+
%c1_i32 = arith.constant 1 : i32
100+
%c8_i32 = arith.constant 8 : i32
101+
%c1_i64 = arith.constant 1 : i64
102+
103+
%0 = arith.extsi %arg2 : i32 to i64
104+
// CHECK: tt.make_tensor_descriptor {{.*}} : <i8>, <tensor<1x32xi8, #[[NVMMA_32]]>>
105+
%1 = tt.make_tensor_descriptor %arg0, [%arg1, %arg2], [%0, %c1_i64] : <i8>, <tensor<1x32xi8>>
106+
107+
%2 = scf.while (%arg4 = %1) : (!tt.tensordesc<tensor<1x32xi8>>) -> (!tt.tensordesc<tensor<1x32xi8>>) {
108+
scf.condition(%cond) %arg4 : !tt.tensordesc<tensor<1x32xi8>>
109+
} do {
110+
^bb0(%arg4: !tt.tensordesc<tensor<1x32xi8>>):
111+
// CHECK: ^bb0(%[[ARG4:.*]]: !tt.tensordesc<tensor<1x32xi8, #[[NVMMA_32]]>>):
112+
// CHECK: tt.descriptor_gather %[[ARG4]][{{.*}}] : (!tt.tensordesc<tensor<1x32xi8, #[[NVMMA_32]]>>
113+
%3 = tt.descriptor_gather %arg4[%arg3, %c8_i32] : (!tt.tensordesc<tensor<1x32xi8>>, tensor<32xi32, #blocked>, i32) -> tensor<32x32xi8, #blocked1>
114+
115+
scf.yield %arg4 : !tt.tensordesc<tensor<1x32xi8>>
116+
}
117+
118+
// CHECK: %[[GATHER:.*]] = tt.descriptor_gather {{.*}} : (!tt.tensordesc<tensor<1x32xi8, #[[NVMMA_32]]>>
119+
%4 = tt.descriptor_gather %1[%arg3, %c8_i32] : (!tt.tensordesc<tensor<1x32xi8>>, tensor<32xi32, #blocked>, i32) -> tensor<32x32xi8, #blocked1>
120+
// CHECK: ttg.local_alloc %[[GATHER]] {{.*}} : (tensor<32x32xi8, #blocked1>) -> !ttg.memdesc<32x32xi8, #[[NVMMA_32]], #smem>
121+
%8 = ttg.local_alloc %4 {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<32x32xi8, #blocked1>) -> !ttg.memdesc<32x32xi8, #shared, #smem>
122+
123+
tt.return
124+
}
125+
}

0 commit comments

Comments (0)