
Commit de8e715

[AMD] Optimize gfx9 wave id code generation (#8601)
On GFX9, this PR lifts the computation of `wave_id` to the function entry and additionally emits `llvm.amdgcn.readfirstlane`. This gives us optimized code generation inside loops.
1 parent 7025305 commit de8e715
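
For reference, a minimal sketch of the wave-id sequence that is now emitted once at the function entry, in LLVM/ROCDL dialect IR. This is assembled from the CHECK lines of the new test below; the SSA value names are illustrative, and with a single warp per CTA the wave id folds to the constant 0:

    %tid  = rocdl.workitem.id.x : i32
    %c63  = llvm.mlir.constant(63 : i32) : i32
    %lane = llvm.and %tid, %c63 : i32             // lane id = workitem id modulo the wave size
    %c64  = llvm.mlir.constant(64 : i32) : i32    // wave size on GFX9
    %wave = llvm.mlir.constant(0 : i32) : i32     // wave id; folds to 0 with a single warp per CTA
    %wave_sgpr = llvm.call_intrinsic "llvm.amdgcn.readfirstlane"(%wave) : (i32) -> i32

Hoisting this sequence, and in particular the convergent `readfirstlane`, out of loop bodies is what gives the improved code generation inside loops mentioned above.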

File tree: 2 files changed (+79 / -9 lines)

test/Conversion/amd/buffer_load_to_local_to_llvm.mlir

Lines changed: 40 additions & 5 deletions
@@ -187,11 +187,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
                        %arg2: !ttg.memdesc<64xf32, #shared, #smem, mutable>) {
     %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #blocked>
     // The first constant 0 skips the LDS offset which is also 0
-    // COMMON: llvm.getelementptr
-    // COMMON: llvm.mlir.constant(0 : i32) : i32
-    // COMMON: %[[aux_ca:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // COMMON: llvm.mlir.constant(0 : i32) : i32
-    // COMMON: rocdl.raw.ptr.buffer.load.lds {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[aux_ca]]
+    // COMMON: %[[VOFFSET:.*]] = llvm.select
+    // COMMON-NEXT: %[[IMM0:.*]] = llvm.mlir.constant(0 : i32) : i32
+    // COMMON-NEXT: %[[aux_ca:.*]] = llvm.mlir.constant(0 : i32) : i32
+    // COMMON-NEXT: %[[IMM1:.*]] = llvm.mlir.constant(0 : i32) : i32
+    // COMMON-NEXT: rocdl.raw.ptr.buffer.load.lds {{.*}}, {{.*}}, {{.*}}, %[[VOFFSET]], %[[IMM1]], %[[IMM0]], %[[aux_ca]]
     %1 = amdgpu.buffer_load_to_local %arg0[%0] cacheModifier = ca into %arg2: <f32>[tensor<64xi32, #blocked>] -> <64xf32, #shared, #smem, mutable>
     // COMMON: llvm.getelementptr
     // COMMON: %[[aux_cg:.*]] = llvm.mlir.constant(3 : i32) : i32
@@ -328,3 +328,38 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.sha
     tt.return
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  // COMMON-LABEL: buffer_load_to_local_wave_id
+  tt.func public @buffer_load_to_local_wave_id(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
+                       %arg2: !ttg.memdesc<64xf32, #shared, #smem, mutable>, %arg3: i32) {
+    // COMMON: %0 = rocdl.workitem.id.x : i32
+    // COMMON-NEXT: %1 = llvm.mlir.constant(63 : i32) : i32
+    // COMMON-NEXT: %2 = llvm.and %0, %1 : i32
+    // COMMON-NEXT: %3 = llvm.mlir.constant(64 : i32) : i32
+    // COMMON-NEXT: %4 = llvm.mlir.constant(0 : i32) : i32
+    // COMMON-NEXT: %5 = llvm.call_intrinsic "llvm.amdgcn.readfirstlane"(%4) : (i32) -> i32
+    // COMMON-NEXT: %6 = rocdl.workitem.id.x : i32
+    // COMMON-NEXT: %7 = llvm.mlir.constant(63 : i32) : i32
+    // COMMON-NEXT: %8 = llvm.and %6, %7 : i32
+    // COMMON-NEXT: %9 = llvm.mlir.constant(64 : i32) : i32
+    // COMMON-NEXT: %10 = llvm.mlir.constant(0 : i32) : i32
+    // COMMON-NEXT: %11 = llvm.call_intrinsic "llvm.amdgcn.readfirstlane"(%10) : (i32) -> i32
+
+    %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #blocked>
+    %1 = amdgpu.buffer_load_to_local %arg0[%0] into %arg2: <f32>[tensor<64xi32, #blocked>] -> <64xf32, #shared, #smem, mutable>
+    %c0_i32 = arith.constant 0 : i32
+    %cond = llvm.icmp "eq" %arg3, %c0_i32 : i32
+    cf.cond_br %cond, ^bb1, ^bb2
+  ^bb1:
+    amdgpu.buffer_load_to_local %arg0[%0] into %arg2: <f32>[tensor<64xi32, #blocked>] -> <64xf32, #shared, #smem, mutable>
+    cf.br ^bb1
+  ^bb2:
+    tt.return
+  }
+}

third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 39 additions & 4 deletions
@@ -482,7 +482,7 @@ struct DirectToLdsLoadConversionBase : public LoadStoreConversionBase {
   void lowerDirectToLDSLoad(
       RewriterBase &rewriter, Location loc, RankedTensorType srcTy,
       MemDescType dstTy, SmallVector<Value> loadVals, Value llDst,
-      Type resElemTy, unsigned vec,
+      Type resElemTy, unsigned vec, triton::AMD::ISAFamily isaFamily,
       std::function<SmallVector<Value>(RewriterBase &, Location,
                                        ArrayRef<Value>, Value, int, VectorType)>
           lowerInst) const {
@@ -511,7 +511,40 @@ struct DirectToLdsLoadConversionBase : public LoadStoreConversionBase {
         LLVM::getSharedMemoryObjectFromStruct(loc, llDst, resElemTy, rewriter);
     auto affineOffset = smemObj.getShmemOffset(loc, rewriter, dstTy);
     auto maskSpanAffineOffset = SharedMemoryObject::getMaskSpanOffsets(dstTy);
-    auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
+
+    Value laneId, warpId;
+    if (ISAFamily::CDNA3 == isaFamily || ISAFamily::CDNA4 == isaFamily) {
+      // On GFX9, there is no dedicated hardware instruction to read `wave_id`.
+      // The value is instead computed from `workitem.id.x`. Per the GFX9 ABI,
+      // `workitem.id.x` is initialized in a vector register, and vector
+      // instructions are generated for IR operations that depend on `wave_id`.
+      //
+      // A `v_readfirstlane` instruction is inserted at the end of these vector
+      // sequences to transfer the value from a vector register to a scalar
+      // register, initializing `$m0`.
+
+      // When this sequence occurs inside a loop, the MachineLICM pass does not
+      // hoist it because `v_readfirstlane` is convergent. Since both
+      // `workitem.id.x` and `wave_id` are constant at runtime, their
+      // computation can be safely hoisted to the function entry block.
+      auto insertPt = rewriter.saveInsertionPoint();
+      Operation *parentOp = insertPt.getBlock()->getParentOp();
+      while (!isa<LLVM::LLVMFuncOp>(parentOp)) {
+        parentOp = parentOp->getParentOp();
+      }
+
+      auto funcOp = cast<LLVM::LLVMFuncOp>(parentOp);
+      rewriter.setInsertionPointToStart(&funcOp.getBody().front());
+
+      std::tie(laneId, warpId) = getLaneAndWarpId(rewriter, loc);
+      auto call = LLVM::createLLVMIntrinsicCallOp(
+          rewriter, loc, "llvm.amdgcn.readfirstlane", {i32_ty}, {warpId});
+      warpId = call.getResult(0);
+      rewriter.restoreInsertionPoint(insertPt);
+    } else {
+      std::tie(laneId, warpId) = getLaneAndWarpId(rewriter, loc);
+    }
+
     auto calcPaddedOffset = [&](Value smemOffset) {
       TritonLLVMOpBuilder b(loc, rewriter);
       auto bitwidth = dstTy.getElementTypeBitWidth();
@@ -873,7 +906,8 @@ struct BufferLoadToLocalOpConversion
     };
 
     lowerDirectToLDSLoad(rewriter, loc, ptrType, flatDstTy, loadVals, llDst,
-                         resElemTy, vec, emitBufferLoadLds);
+                         resElemTy, vec, targetInfo.getISAFamily(),
+                         emitBufferLoadLds);
 
     // Drop the result token.
     Value zero = LLVM::ConstantOp::create(rewriter, op.getLoc(),
@@ -999,7 +1033,8 @@ struct AsyncCopyGlobalToLocalOpConversion
     };
 
     lowerDirectToLDSLoad(rewriter, loc, srcTy, flatDstTy, loadVals, llDst,
-                         resElemTy, vec, emitGlobalLoadLds);
+                         resElemTy, vec, targetInfo.getISAFamily(),
+                         emitGlobalLoadLds);
 
     // Drop the result token.
     Value zero = LLVM::ConstantOp::create(rewriter, op.getLoc(),
