Skip to content

Commit 3c47763

Browse files
Mogball (meta-codesync[bot])
authored and committed
[Cherry-pick] [Dialect] Make warp specialization require at least 4 warps (#8005) (#548)
Summary: Cherry-picked from upstream OAI repository. Original Commit: cfc0a9d Original Author: Jeff Niu Original Date: 2025-08-29 09:45:18 -0700 Original commit message: ``` [Dialect] Make warp specialization require at least 4 warps (#8005) The warpgroup allocator makes fairly strong assumptions that the default number of warps is at least 4. Untangling this is non-trivial (see #7940 which is WIP to fix this). For now, just add an error message to prevent the compiler from crashing and confusing users. ``` This PR was automatically cherry-picked from the upstream triton-lang/triton repository. Pull Request resolved: #548 Reviewed By: agron911, htyu Differential Revision: D85908114 Pulled By: dshi7 fbshipit-source-id: af0a7353916a7ac70a151129d26357dee2f83518
1 parent 380d218 commit 3c47763

File tree

7 files changed

+74
-65
lines changed

7 files changed

+74
-65
lines changed

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -908,6 +908,14 @@ LogicalResult WarpSpecializeOp::verify() {
908908
"cannot be nested inside another `ttg.warp_specialize` op");
909909
}
910910

911+
std::optional<int> numWarps = maybeLookupNumWarps(*this);
912+
if (numWarps && *numWarps % 4 != 0) {
913+
return mlir::emitError(getLoc())
914+
<< "warp-specialized kernels requires "
915+
"num_warps to be a multiple of 4 but num_warps="
916+
<< *numWarps;
917+
}
918+
911919
return success();
912920
}
913921

python/test/unit/language/test_tlx.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1661,7 +1661,7 @@ def tcgen5_fa_kernel(a_ptr, stride_am, stride_ak, b_ptr, stride_bk, stride_bn, c
16611661

16621662
kern_kwargs = {'BLOCK_M': M, 'BLOCK_K': K, 'BLOCK_N': N}
16631663
kernel = tcgen5_fa_kernel[(1, 1)](a, a.stride(0), a.stride(1), b, b.stride(0), b.stride(1), c, c.stride(0),
1664-
c.stride(1), d, d.stride(0), d.stride(1), **kern_kwargs, num_warps=1)
1664+
c.stride(1), d, d.stride(0), d.stride(1), **kern_kwargs, num_warps=4)
16651665

16661666
ttgir = kernel.asm["ttgir"]
16671667
assert ttgir.count("ttng.tmem_alloc") == 2

test/Conversion/triton_to_tritongpu.mlir

Lines changed: 0 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -168,64 +168,3 @@ tt.func @cf_br(%ptr: !tt.ptr<i32>) {
168168
tt.store %ptrs, %arg0 : tensor<128x!tt.ptr<i32>>
169169
tt.return
170170
}
171-
172-
// -----
173-
174-
// CHECK-LABEL: @legalize_warp_specialize
175-
tt.func @legalize_warp_specialize(%arg0: !tt.ptr<i32>, %arg1: !tt.ptr<i32>) {
176-
ttg.warp_specialize(%arg0)
177-
default {
178-
ttg.warp_yield
179-
}
180-
partition0(%arg2: !tt.ptr<i32>) num_warps(2) {
181-
// CHECK: tt.splat {{.*}} : !tt.ptr<i32> -> tensor<256x!tt.ptr<i32>, #blocked>
182-
// CHECK: tt.load {{.*}} : tensor<256x!tt.ptr<i32>, #blocked>
183-
%splatted = tt.splat %arg2 : !tt.ptr<i32> -> tensor<256x!tt.ptr<i32>>
184-
%input = tt.load %splatted : tensor<256x!tt.ptr<i32>>
185-
ttg.warp_return
186-
} : (!tt.ptr<i32>) -> ()
187-
tt.return
188-
}
189-
190-
191-
// -----
192-
// CHECK-DAG: [[DEFAULT:#.*]] = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0]}>
193-
// CHECK-DAG: [[WS1:#.*]] = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}>
194-
// CHECK: @legalize_warp_partition
195-
module attributes {tlx.has_warp_spec_ops = true, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
196-
tt.func public @legalize_warp_partition(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
197-
%c1024_i32 = arith.constant 1024 : i32
198-
%0 = tt.get_program_id x : i32
199-
%1 = arith.muli %0, %c1024_i32 : i32
200-
ttg.warp_specialize(%arg3, %1, %arg5)
201-
// CHECK: default
202-
default {
203-
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
204-
%3 = tt.splat %1 : i32 -> tensor<1024xi32>
205-
%4 = arith.addi %3, %2 : tensor<1024xi32>
206-
%5 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
207-
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
208-
// CHECK: tt.load {{.*}} : tensor<1024x!tt.ptr<f32>, [[DEFAULT]]
209-
%7 = tt.load %6 : tensor<1024x!tt.ptr<f32>>
210-
%8 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
211-
%9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
212-
tt.store %9, %7 : tensor<1024x!tt.ptr<f32>>
213-
ttg.warp_yield
214-
}
215-
// CHECK: partition0
216-
partition0(%arg7: !tt.ptr<f32>, %arg8: i32, %arg9: !tt.ptr<f32>) num_warps(1) {
217-
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
218-
%3 = tt.splat %arg8 : i32 -> tensor<1024xi32>
219-
%4 = arith.addi %3, %2 : tensor<1024xi32>
220-
%5 = tt.splat %arg7 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
221-
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
222-
// CHECK: tt.load {{.*}} : tensor<1024x!tt.ptr<f32>, [[WS1]]
223-
%7 = tt.load %6 : tensor<1024x!tt.ptr<f32>>
224-
%8 = tt.splat %arg9 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
225-
%9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
226-
tt.store %9, %7 : tensor<1024x!tt.ptr<f32>>
227-
ttg.warp_return
228-
} : (!tt.ptr<f32>, i32, !tt.ptr<f32>) -> ()
229-
tt.return
230-
}
231-
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=cuda:80 num-warps=4' | FileCheck %s
2+
3+
// CHECK-LABEL: @legalize_warp_specialize
4+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
5+
tt.func @legalize_warp_specialize(%arg0: !tt.ptr<i32>, %arg1: !tt.ptr<i32>) {
6+
ttg.warp_specialize(%arg0)
7+
default {
8+
ttg.warp_yield
9+
}
10+
partition0(%arg2: !tt.ptr<i32>) num_warps(2) {
11+
// CHECK: tt.splat {{.*}} : !tt.ptr<i32> -> tensor<256x!tt.ptr<i32>, #blocked>
12+
// CHECK: tt.load {{.*}} : tensor<256x!tt.ptr<i32>, #blocked>
13+
%splatted = tt.splat %arg2 : !tt.ptr<i32> -> tensor<256x!tt.ptr<i32>>
14+
%input = tt.load %splatted : tensor<256x!tt.ptr<i32>>
15+
ttg.warp_return
16+
} : (!tt.ptr<i32>) -> ()
17+
tt.return
18+
}
19+
}
20+
21+
22+
// -----
23+
// CHECK-DAG: [[DEFAULT:#.*]] = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
24+
// CHECK-DAG: [[WS1:#.*]] = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}>
25+
// CHECK: @legalize_warp_partition
26+
module attributes {tlx.has_warp_spec_ops = true, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
27+
tt.func public @legalize_warp_partition(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
28+
%c1024_i32 = arith.constant 1024 : i32
29+
%0 = tt.get_program_id x : i32
30+
%1 = arith.muli %0, %c1024_i32 : i32
31+
ttg.warp_specialize(%arg3, %1, %arg5)
32+
// CHECK: default
33+
default {
34+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
35+
%3 = tt.splat %1 : i32 -> tensor<1024xi32>
36+
%4 = arith.addi %3, %2 : tensor<1024xi32>
37+
%5 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
38+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
39+
// CHECK: tt.load {{.*}} : tensor<1024x!tt.ptr<f32>, [[DEFAULT]]
40+
%7 = tt.load %6 : tensor<1024x!tt.ptr<f32>>
41+
%8 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
42+
%9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
43+
tt.store %9, %7 : tensor<1024x!tt.ptr<f32>>
44+
ttg.warp_yield
45+
}
46+
// CHECK: partition0
47+
partition0(%arg7: !tt.ptr<f32>, %arg8: i32, %arg9: !tt.ptr<f32>) num_warps(1) {
48+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
49+
%3 = tt.splat %arg8 : i32 -> tensor<1024xi32>
50+
%4 = arith.addi %3, %2 : tensor<1024xi32>
51+
%5 = tt.splat %arg7 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
52+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
53+
// CHECK: tt.load {{.*}} : tensor<1024x!tt.ptr<f32>, [[WS1]]
54+
%7 = tt.load %6 : tensor<1024x!tt.ptr<f32>>
55+
%8 = tt.splat %arg9 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
56+
%9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
57+
tt.store %9, %7 : tensor<1024x!tt.ptr<f32>>
58+
ttg.warp_return
59+
} : (!tt.ptr<f32>, i32, !tt.ptr<f32>) -> ()
60+
tt.return
61+
}
62+
}

test/TLX/propagate-layout.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -558,7 +558,7 @@ module attributes {tlx.has_explicit_local_mem_access = true, "ttg.num-ctas" = 1
558558
// CHECK-DAG: #[[$TMEM:.*]] = #ttng.tensor_memory_encoding<blockM = 64, blockN = 32, unpacked = true>
559559
// CHECK-DAG: #[[$TMEM1:.*]] = #ttng.tensor_memory_encoding<blockM = 64, blockN = 32, unpacked = false>
560560

561-
module attributes {tlx.has_explicit_local_mem_access = true, tlx.has_tlx_ops = true, tlx.has_warp_spec_ops = true, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
561+
module attributes {tlx.has_explicit_local_mem_access = true, tlx.has_tlx_ops = true, tlx.has_warp_spec_ops = true, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
562562
// CHECK-LABEL: @tcgen5_fa_kernel
563563
tt.func public @tcgen5_fa_kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
564564
%0 = ttg.local_alloc : () -> !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable>

test/TLX/rewrite-local-alias.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
// CHECK-DAG: #[[$TMEM:.*]] = #ttng.tensor_memory_encoding<blockM = 64, blockN = 32, unpacked = true>
1414
// CHECK-DAG: #[[$TMEM1:.*]] = #ttng.tensor_memory_encoding<blockM = 64, blockN = 32, unpacked = false>
1515

16-
module attributes {tlx.has_explicit_local_mem_access = true, tlx.has_tlx_ops = true, tlx.has_warp_spec_ops = true, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
16+
module attributes {tlx.has_explicit_local_mem_access = true, tlx.has_tlx_ops = true, tlx.has_warp_spec_ops = true, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
1717
// CHECK-LABEL: @tcgen5_fa_kernel
1818
tt.func public @tcgen5_fa_kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
1919
// CHECK: %[[$LOCAL_ALLOC:.*]] = ttg.local_alloc : () -> !ttg.memdesc<1x64x16xf16, #[[$SHARED]], #smem, mutable>

test/TLX/tlx-verifier.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22
// RUN: triton-opt -split-input-file -pass-pipeline='builtin.module(triton-tlx-fixup{num-warps=8 target=cuda:90 num-ctas=2 threads-per-warp=32})' --verify-diagnostics %s
33

4-
module attributes {tlx.has_warp_spec_ops = true, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
4+
module attributes {tlx.has_warp_spec_ops = true, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
55
tt.func public @legalize_warp_partition(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
66
%c1024_i32 = arith.constant 1024 : i32
77
%0 = tt.get_program_id x : i32

0 commit comments

Comments
 (0)