Skip to content

Commit b3b1b13

Browse files
authored
[Blackwell] Fix codegen for tmem_load of Nx1xf32 (#7234)
LLVM's `inline_asm` is only allowed to return a struct when the asm has more than one result. This change also makes unpacked `Nx2xf16` work, but `Nx1xf16` still crashes; it can be supported if it is ever needed.
1 parent e7cef2e commit b3b1b13

File tree

2 files changed

+53
-6
lines changed

2 files changed

+53
-6
lines changed

test/Conversion/tritongpu_to_llvm_blackwell.mlir

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,7 @@ tt.func public @tmem_message_maxnreg_80(%desc: !ttg.memdesc<128x64xf32, #tmem, #
495495
tt.return
496496
}
497497

498+
// CHECK-LABEL: @module_constraint_supercedes_local
498499
tt.func public @module_constraint_supercedes_local(%desc: !ttg.memdesc<128x64xf32, #tmem, #ttng.tensor_memory>) {
499500
ttg.warp_specialize(%desc) attributes {actualRegisters = array<i32: 256, 256>}
500501
default {
@@ -611,6 +612,10 @@ tt.func private @reinterpret(%arg0: !ttg.memdesc<128xf32, #tmem, #ttng.tensor_me
611612

612613
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = false>
613614
#tmem_unpacked = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
615+
#tmem_x1 = #ttng.tensor_memory_encoding<blockM = 128, blockN = 1, unpacked = false>
616+
#tmem_x1_unpacked = #ttng.tensor_memory_encoding<blockM = 128, blockN = 2, unpacked = true>
617+
618+
#blocked_x1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
614619

615620
module attributes {"ttg.num-warps" = 4 : i32} {
616621

@@ -633,4 +638,29 @@ tt.func private @subslice_packed(%arg0: !ttg.memdesc<128x128xf16, #tmem, #ttng.t
633638
tt.return %0 : !ttg.memdesc<128x64xf16, #tmem, #ttng.tensor_memory>
634639
}
635640

641+
// CHECK-LABEL: @load_store_x1
642+
tt.func @load_store_x1(%arg0: !ttg.memdesc<128x1xf32, #tmem_x1, #ttng.tensor_memory, mutable>) {
643+
%true = arith.constant true
644+
// CHECK: [[V:%.*]] = llvm.inline_asm {{.*}}tcgen05.ld.sync{{.*}} (i32) -> i32
645+
// CHECK: [[F:%.*]] = llvm.bitcast [[V]] : i32 to f32
646+
// CHECK: insertvalue [[F]], {{.*}} : !llvm.struct<(f32)>
647+
%0 = ttng.tmem_load %arg0 : !ttg.memdesc<128x1xf32, #tmem_x1, #ttng.tensor_memory, mutable> -> tensor<128x1xf32, #blocked_x1>
648+
ttng.tmem_store %0, %arg0, %true : tensor<128x1xf32, #blocked_x1> -> !ttg.memdesc<128x1xf32, #tmem_x1, #ttng.tensor_memory, mutable>
649+
tt.return
650+
}
651+
652+
// CHECK-LABEL: @load_store_x1_unpacked
653+
tt.func @load_store_x1_unpacked(%arg0: !ttg.memdesc<128x2xf16, #tmem_x1_unpacked, #ttng.tensor_memory, mutable>) {
654+
%true = arith.constant true
655+
// CHECK: [[C0:%.*]] = llvm.mlir.constant(0 : i32)
656+
// CHECK: [[C1:%.*]] = llvm.mlir.constant(1 : i32)
657+
// CHECK: [[V:%.*]] = llvm.inline_asm {{.*}}tcgen05.ld.sync{{.*}} (i32) -> i32
658+
// CHECK: [[F:%.*]] = llvm.bitcast [[V]] : i32 to vector<2xf16>
659+
// CHECK: extractelement [[F]][[[C0]] : i32]
660+
// CHECK: extractelement [[F]][[[C1]] : i32]
661+
%0 = ttng.tmem_load %arg0 : !ttg.memdesc<128x2xf16, #tmem_x1_unpacked, #ttng.tensor_memory, mutable> -> tensor<128x2xf16, #blocked_x1>
662+
ttng.tmem_store %0, %arg0, %true : tensor<128x2xf16, #blocked_x1> -> !ttg.memdesc<128x2xf16, #tmem_x1_unpacked, #ttng.tensor_memory, mutable>
663+
tt.return
664+
}
665+
636666
}

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TensorMemoryToLLVM.cpp

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,7 @@ void calculateAddressAndEmitTmemMessage(
355355
// are required to cover the entire set of rows per warp.
356356
int numRowPerWarp =
357357
(info.layoutAtom.rowStored == 16 && info.blockM == 64) ? 16 : 32;
358+
358359
for (int rowStart = 0; rowStart < numRowPerWarp;
359360
rowStart += message.numRows) {
360361
for (int colStart = 0; colStart < numColumns;
@@ -590,10 +591,17 @@ Value createTensorMemoryLoad(Location loc, triton::nvidia_gpu::TMEMLoadOp op,
590591
operands.push_back(ptxBuilder.newOperand(address, "r"));
591592
auto &ld = *ptxBuilder.create<PTXInstr>(opcode);
592593
ld(operands, /*onlyAttachMLIRArgs=*/true);
593-
SmallVector<Type> elemTypes(numRegPerMessage, i32_ty);
594-
MLIRContext *ctx = op.getContext();
595-
Type structTy = struct_ty(elemTypes);
596-
Value ret = ptxBuilder.launch(rewriter, loc, structTy);
594+
595+
// LLVM inline_asm with 1 result cannot return a struct.
596+
Type retTy;
597+
if (numRegPerMessage == 1) {
598+
retTy = i32_ty;
599+
} else {
600+
SmallVector<Type> elemTypes(numRegPerMessage, i32_ty);
601+
MLIRContext *ctx = op.getContext();
602+
retTy = struct_ty(elemTypes);
603+
}
604+
Value ret = ptxBuilder.launch(rewriter, loc, retTy);
597605
return ret;
598606
}
599607

@@ -606,8 +614,8 @@ static SmallVector<Value> unpackResults(Value packedValues, Type elemTy,
606614
Type packedType = elemTy;
607615
if (numElementsPer32B > 1)
608616
packedType = vec_ty(elemTy, numElementsPer32B);
609-
for (int i = 0; i < numCols; i++) {
610-
Value result = b.extract_val(i32_ty, packedValues, i);
617+
618+
auto unpackElement = [&](Value result) {
611619
result = b.bitcast(result, packedType);
612620
if (numElementsPer32B > 1) {
613621
for (int j = 0; j < numElementsPer32B; j++) {
@@ -617,6 +625,15 @@ static SmallVector<Value> unpackResults(Value packedValues, Type elemTy,
617625
} else {
618626
resultVals.push_back(result);
619627
}
628+
};
629+
630+
if (isa<LLVM::LLVMStructType>(packedValues.getType())) {
631+
for (int i = 0; i < numCols; i++) {
632+
Value result = b.extract_val(i32_ty, packedValues, i);
633+
unpackElement(result);
634+
}
635+
} else {
636+
unpackElement(packedValues);
620637
}
621638
return resultVals;
622639
}

0 commit comments

Comments
 (0)