Commit 430f8b2

[BACKEND] Fix unnecessary cvt caused by wgmma wait op (#8579)
Fixes #8578. We were using the wrong output constraint, which led LLVM to extend the fp16 value to 32 bits. Fixing the constraint removes the conversion. Note that we still end up with a no-op sequence like:

```ptx
mov.b32 {%rs1, %rs2}, %r1
mov.b32 %r2, {%rs1, %rs2}
```

However, `ptxas` is able to optimize these out.
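For context, here is a minimal, self-contained sketch of the per-member constraint selection this commit introduces. The real code queries `mlir::DataLayout` on MLIR types inside the lowering pattern; the `Ty` struct and `constraintFor` helper below are hypothetical stand-ins for illustration only:

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical stand-in for an MLIR struct-member type.
struct Ty {
  uint32_t bits; // bitwidth as reported by the data layout
  bool isF32;    // whether the member is an f32
};

// Pick a PTX inline-asm output constraint per member, mirroring the fix:
// 64-bit -> "=l", 32-bit -> "=f" (f32) or "=r" (integer), 16-bit -> "=h".
// "=h" keeps an fp16 value in a 16-bit register, so LLVM no longer emits
// a cvt to widen it to 32 bits.
std::string constraintFor(const Ty &ty) {
  switch (ty.bits) {
  case 64:
    return "=l";
  case 32:
    return ty.isF32 ? "=f" : "=r";
  case 16:
    return "=h";
  default:
    throw std::runtime_error("unexpected bitwidth");
  }
}

int main() {
  // The struct from the new test: (f32, f32, i32, i32, f16, f16).
  std::vector<Ty> body = {{32, true},  {32, true},  {32, false},
                          {32, false}, {16, false}, {16, false}};
  std::string out;
  for (const auto &ty : body)
    out += constraintFor(ty) + ",";
  std::cout << out << "\n"; // prints: =f,=f,=r,=r,=h,=h,
}
```

This matches the constraint string the new lit test expects; the trailing `0,1,2,3,4,5` in the test's `"=f,=f,=r,=r,=h,=h,0,1,2,3,4,5"` are the usual LLVM inline-asm tied constraints, binding each input operand to the output of the same index.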
1 parent a295e60

File tree

2 files changed: +36 -3 lines


test/Conversion/nvgpu_to_llvm.mlir

13 additions, 0 deletions

```diff
@@ -58,6 +58,19 @@ llvm.func @wgmma(%desc: i64, %in: !struct_64xf32) {
 
 // -----
 
+!struct = !llvm.struct<(f32, f32, i32, i32, f16, f16)>
+
+// CHECK-LABEL: @wgmma_wait
+llvm.func @wgmma_wait(%in: !struct) {
+  // CHECK: // wait for regs: $0,$1,$2,$3,$4,$5
+  // CHECK: wgmma.wait_group.sync.aligned 0;
+  // CHECK: "=f,=f,=r,=r,=h,=h,0,1,2,3,4,5"
+  %out = nvgpu.wgmma_wait_group %in {pendings = 0 : i32} : !struct
+  llvm.return
+}
+
+// -----
+
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
 // CHECK-LABEL: @tensor_memory_base_lowering
 // CHECK: %[[TID:.+]] = nvvm.read.ptx.sreg.tid.x : i32
```
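The FileCheck lines above pin both the register-wait comment and the full constraint string, so a regression back to a uniform width-extending constraint (e.g. `=r` for the f16 members) would fail this test.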

third_party/nvidia/lib/NVGPUToLLVM/NVGPUToLLVMPass.cpp

23 additions, 3 deletions

```diff
@@ -303,9 +303,29 @@ class WGMMAWaitGroupOpPattern : public OpRewritePattern<ttn::WGMMAWaitGroupOp> {
   Constraints getOutputConstraints(ttn::WGMMAWaitGroupOp op) const {
     auto outputStructType = cast<LLVM::LLVMStructType>(op.getType());
     uint32_t numOutputRegs = outputStructType.getBody().size();
-    std::string output =
-        outputStructType.getBody().front().isF32() ? "=f" : "=r";
-    return Constraints(numOutputRegs, output);
+    Constraints constraints;
+    constraints.reserve(numOutputRegs);
+    mlir::DataLayout dl(op->getParentOfType<mlir::ModuleOp>());
+    for (auto ty : outputStructType.getBody()) {
+      auto bitwidth = dl.getTypeSizeInBits(ty);
+      std::string c;
+      switch (bitwidth) {
+      case 64:
+        c = "=l";
+        break;
+      case 32:
+        c = ty.isF32() ? "=f" : "=r";
+        break;
+      case 16:
+        c = "=h";
+        break;
+      default:
+        llvm::report_fatal_error("Unexpected bitwidth in WGMMAWaitGroupOp: " +
+                                 Twine(bitwidth));
+      }
+      constraints.push_back(c);
+    }
+    return constraints;
   }
 
   OperandsAndConstraints
```
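The key change: the output constraint is now derived per struct member from its data-layout bitwidth, instead of replicating one constraint chosen from the first member alone. A mixed-precision result struct such as `(f32, f32, i32, i32, f16, f16)` therefore gets `=f,=f,=r,=r,=h,=h`, and the f16 members stay in 16-bit registers rather than being widened to 32 bits.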
