[AMD] Emit llvm.amdgcn.wave.id for GFX12 (#8817)

antiagainst · web-flow · commit 5dc3032fde9c · 2025-11-24T11:05:08.000-08:00
diff --git a/test/Conversion/amd/buffer_load_to_local_to_llvm.mlir b/test/Conversion/amd/buffer_load_to_local_to_llvm.mlir
@@ -331,42 +331,6 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.sha
 
 // -----
 
-#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
-#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
-#smem = #ttg.shared_memory
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
-  // COMMON-LABEL: buffer_load_to_local_wave_id
-  tt.func public @buffer_load_to_local_wave_id(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
-                                %arg2: !ttg.memdesc<64xf32, #shared, #smem, mutable>, %arg3: i32) {
-    //      COMMON: %[[C64:.+]] = llvm.mlir.constant(64 : i32) : i32
-    // COMMON-NEXT: %[[IDX:.+]] = rocdl.workitem.id.x : i32
-    // COMMON-NEXT: %[[C63:.+]] = llvm.mlir.constant(63 : i32) : i32
-    // COMMON-NEXT: %[[AND:.+]] = llvm.and %[[IDX]], %[[C63]] : i32
-    // COMMON-NEXT: %[[DIV:.+]] = llvm.udiv %[[AND]], %[[C64]] : i32
-    // COMMON-NEXT: %{{.+}} = rocdl.readfirstlane %[[DIV]] : i32
-
-    //      COMMON: %[[C64:.+]] = llvm.mlir.constant(64 : i32) : i32
-    // COMMON-NEXT: %[[IDX:.+]] = rocdl.workitem.id.x : i32
-    // COMMON-NEXT: %[[C63:.+]] = llvm.mlir.constant(63 : i32) : i32
-    // COMMON-NEXT: %[[AND:.+]] = llvm.and %[[IDX]], %[[C63]] : i32
-    // COMMON-NEXT: %[[DIV:.+]] = llvm.udiv %[[AND]], %[[C64]] : i32
-    // COMMON-NEXT: %{{.+}} = rocdl.readfirstlane %[[DIV]] : i32
-
-    %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #blocked>
-    %1 = amdg.buffer_load_to_local %arg0[%0] into %arg2: <f32>[tensor<64xi32, #blocked>] -> <64xf32, #shared, #smem, mutable>
-    %c0_i32 = arith.constant 0 : i32
-    %cond = llvm.icmp "eq" %arg3, %c0_i32 : i32
-    cf.cond_br %cond, ^bb1, ^bb2
-    ^bb1:
-      amdg.buffer_load_to_local %arg0[%0] into %arg2: <f32>[tensor<64xi32, #blocked>] -> <64xf32, #shared, #smem, mutable>
-      cf.br ^bb1
-    ^bb2:
-    tt.return
-  }
-}
-
-// -----
-
 #blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
 #shared1D = #ttg.swizzled_shared<{vec = 2, perPhase = 1, maxPhase = 8, order = [0]}>
 #smem = #ttg.shared_memory
diff --git a/test/Conversion/amd/warp_id_to_llvm.mlir b/test/Conversion/amd/warp_id_to_llvm.mlir
@@ -0,0 +1,29 @@
+// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx942  | FileCheck %s --check-prefixes=CHECK,GFX9
+// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx950  | FileCheck %s --check-prefixes=CHECK,GFX9
+// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx1200 | FileCheck %s --check-prefixes=CHECK,GFX12
+// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx1250 | FileCheck %s --check-prefixes=CHECK,GFX12
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 0 : i32, "ttg.threads-per-warp" = 64 : i32} {
+// Lowering of ttg.warp_id: gfx942/gfx950 runs expect a workitem-id computation, gfx1200/gfx1250 runs expect llvm.amdgcn.wave.id.
+// CHECK-LABEL: @wave_id
+tt.func public @wave_id() {
+  //       GFX9: %[[C64:.+]] = llvm.mlir.constant(64 : i32) : i32
+  //  GFX9-NEXT: %[[IDX:.+]] = rocdl.workitem.id.x : i32
+  //  GFX9-NEXT: %[[C63:.+]] = llvm.mlir.constant(63 : i32) : i32
+  //  GFX9-NEXT: %[[AND:.+]] = llvm.and %[[IDX]], %[[C63]] : i32
+  //  GFX9-NEXT: %[[DIV:.+]] = llvm.udiv %[[AND]], %[[C64]] : i32
+  //  GFX9-NEXT: %{{.+}} = rocdl.readfirstlane %[[DIV]] : i32
+
+  // GFX12-NEXT: llvm.call_intrinsic "llvm.amdgcn.wave.id"
+  //      CHECK: scf.for
+  // ttg.warp_id is placed inside the loop below, yet the directives above require its lowering to appear before scf.for.
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  scf.for %i = %c0 to %c1 step %c1 {
+    %1 = "ttg.warp_id"() : () -> i32
+    scf.yield
+  }
+  tt.return
+}
+
+}
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/WarpIdOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/WarpIdOpToLLVM.cpp
@@ -33,8 +33,15 @@ class WarpIdOpPattern : public ConvertOpToLLVMPattern<WarpIdOp> {
         &funcOp.getFunctionBody().getBlocks().front().front());
 
     auto loc = op.getLoc();
-    auto b = TritonLLVMOpBuilder(loc, rewriter);
     auto isaFamily = targetInfo.getISAFamily();
+    if (ISAFamily::RDNA4 == isaFamily || ISAFamily::GFX1250 == isaFamily) {
+      auto warpIdOp = LLVM::createLLVMIntrinsicCallOp(
+          rewriter, loc, "llvm.amdgcn.wave.id", {i32_ty}, ValueRange{});
+      rewriter.replaceOp(op, warpIdOp.getResult(0));
+      return success();
+    }
+
+    auto b = TritonLLVMOpBuilder(loc, rewriter);
     int threadsPerWarp = triton::gpu::lookupThreadsPerWarp(rewriter);
     Value warpSizeVal = b.i32_val(threadsPerWarp);
     Value tid = getThreadId(rewriter, loc);