[BACKEND] Fix inline asm bug for multiple packed <32bit output (#5273)

ggengnv · web-flow · commit 7b2beae97ade · 2024-11-27T14:26:11.000-08:00
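Resolves #5272. Fixes the logic for walking the result struct returned by LLVM InlineAsm when there are multiple sub-32-bit results: the struct index now advances continuously across all results instead of being reset for each result and offset by `i * op.getPackedElement()`, because the packed struct can have fewer fields than results × packed elements (e.g. four `vector<2xbf16>` fields for two bf16 results with `packed_element = 4`). Also adds a lit test covering this case.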
diff --git a/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
@@ -325,13 +325,12 @@ struct ElementwiseInlineAsmOpConversion
     // asmResults is a flat struct; pack its values into
     // [return_value][op.getPackedElement()].
     SmallVector<SmallVector<Value>> ret(op->getNumResults());
+    int structIdx = 0;
     for (int i = 0; i < op->getNumResults(); i++) {
-      int structIdx = 0;
       for (int j = 0; j < op.getPackedElement(); j++) {
         Value val;
         if (asmRetTypes.size() > 1) {
-          val =
-              extract_val(asmResults, i * op.getPackedElement() + structIdx++);
+          val = extract_val(asmResults, structIdx++);
         } else {
           val = asmResults;
         }
diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir
@@ -1897,3 +1897,16 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+
+// CHECK: inline_asm_pack
+#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} {
+  // check specifically for the case where asm has two results, pack > 1, and the result bitwidth is < 32
+  tt.func public @inline_asm_pack(%80: tensor<64x64xi8, #blocked>) attributes {noinline = false} {
+    // CHECK: llvm.inline_asm asm_dialect {{.*}} (vector<4xi8>) -> !llvm.struct<(vector<2xbf16>, vector<2xbf16>, vector<2xbf16>, vector<2xbf16>)>
+    %83:2 = tt.elementwise_inline_asm "" {constraints = "=r,=r,=r,=r,r", packed_element = 4 : i32, pure = true} %80 : tensor<64x64xi8, #blocked> -> tensor<64x64xbf16, #blocked>, tensor<64x64xbf16, #blocked>
+    tt.return
+  }
+}