@@ -299,7 +299,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
   tt.func @test.work(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>> {
     %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
     ^bb0(%arg1: f32, %arg2: f32):
-      %1 = arith.maxnumf %arg1, %arg2 : f32
+      %1 = arith.addf %arg1, %arg2 : f32
       tt.reduce.return %1 : f32
     }) : (tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
     tt.return %0 : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
@@ -308,26 +308,32 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
 ```
 Is converted to:
 ```mlir
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
-#blocked = #triton_gpu.blocked<{sizePerThread = [1, 16, 1], threadsPerWarp = [16, 1, 1], warpsPerCTA = [2, 2, 1], order = [0, 2, 1], CTAsPerCGA = [1, 1, 1], CTASplitNum = [1, 1, 1], CTAOrder = [0, 2, 1]}>
-#blocked2 = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
-
+#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1, 1, 1, 1], threadsPerWarp = [1, 16, 1, 1, 1], warpsPerCTA = [2, 1, 1, 2, 1], order = [4, 0, 1, 2, 3]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16, 1], threadsPerWarp = [16, 1, 1], warpsPerCTA = [2, 1, 2], order = [2, 0, 1]}>
+#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [2, 2], order = [1, 0]}>
+#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1], A = [8, 8], B = [8, 16], C = [8, 16]}>
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "xpu", "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} {
-  tt.func @test.work(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>> {
-    %0 = tt.reshape %arg0 {allow_reorder = true} : tensor<32x32xf32, #mma> -> tensor<32x32x1xf32, #blocked>
-    %1 = "tt.reduce"(%0) <{axis = 2 : i32}> ({
+  tt.func @test_two_warps_twice(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
+    %0 = tt.reshape %arg0 {allow_reorder = true} : tensor<32x32xf32, #mma> -> tensor<32x16x1x2x1xf32, #blocked>
+    %1 = "tt.reduce"(%0) <{axis = 4 : i32}> ({
+    ^bb0(%arg1: f32, %arg2: f32):
+      %7 = arith.addf %arg1, %arg2 : f32
+      tt.reduce.return %7 : f32
+    }) : (tensor<32x16x1x2x1xf32, #blocked>) -> tensor<32x16x1x2xf32, #triton_gpu.slice<{dim = 4, parent = #blocked}>>
+    %2 = "tt.reduce"(%1) <{axis = 2 : i32}> ({
+    ^bb0(%arg1: f32, %arg2: f32):
+      %7 = arith.addf %arg1, %arg2 : f32
+      tt.reduce.return %7 : f32
+    }) : (tensor<32x16x1x2xf32, #triton_gpu.slice<{dim = 4, parent = #blocked}>>) -> tensor<32x16x2xf32, #triton_gpu.slice<{dim = 2, parent = #triton_gpu.slice<{dim = 4, parent = #blocked}>}>>
+    %3 = triton_gpu.convert_layout %2 : tensor<32x16x2xf32, #triton_gpu.slice<{dim = 2, parent = #triton_gpu.slice<{dim = 4, parent = #blocked}>}>> -> tensor<32x16x2xf32, #blocked1>
+    %4 = tt.reshape %3 {allow_reorder = true} : tensor<32x16x2xf32, #blocked1> -> tensor<32x32xf32, #blocked2>
+    %5 = "tt.reduce"(%4) <{axis = 1 : i32}> ({
     ^bb0(%arg1: f32, %arg2: f32):
-      %2 = arith.maxnumf %arg1, %arg2 : f32
-      tt.reduce.return %2 : f32
-    }) : (tensor<32x32x1xf32, #blocked>) -> tensor<32x32xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>>
-    %3 = triton_gpu.convert_layout %1 : tensor<32x32xf32, #triton_gpu.slice<{dim = 2, parent = #blocked}>> -> tensor<32x32xf32, #blocked2>
-    %4 = "tt.reduce"(%3) <{axis = 0 : i32}> ({
-    ^bb0(%arg3: f32, %arg4: f32):
-      %5 = arith.maxnumf %arg3, %arg4 : f32
-      tt.reduce.return %5 : f32
-    }) : (tensor<32x32xf32, #blocked2>) -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
-    %6 = triton_gpu.convert_layout %4 : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> -> tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
-    tt.return %6 : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #mma}>>
+      %7 = arith.addf %arg1, %arg2 : f32
+      tt.reduce.return %7 : f32
+    }) : (tensor<32x32xf32, #blocked2>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
+    %6 = triton_gpu.convert_layout %5 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
+    tt.return %6 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
   }
 }
 ```
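For context, a source-level Triton kernel along the lines of the sketch below (hypothetical, not part of this commit) produces a `tt.reduce` with an `arith.addf` combiner region like the input IR in the example: `tl.sum` over a 2D `f32` tile lowers to this kind of reduction. The kernel name, pointer layout, and device string are illustrative assumptions.

```python
# Minimal sketch, assuming a Triton install with an XPU (or CUDA) backend.
import torch
import triton
import triton.language as tl

@triton.jit
def col_sum_kernel(x_ptr, out_ptr, BLOCK: tl.constexpr):
    rows = tl.arange(0, BLOCK)[:, None]       # row offsets of the 2D tile
    cols = tl.arange(0, BLOCK)[None, :]       # column offsets
    x = tl.load(x_ptr + rows * BLOCK + cols)  # load a BLOCK x BLOCK tile
    # tl.sum lowers to "tt.reduce" with an arith.addf combiner region,
    # as in the MLIR example above.
    s = tl.sum(x, axis=0)
    tl.store(out_ptr + tl.arange(0, BLOCK), s)

x = torch.randn(32, 32, device="xpu")         # "cuda" on NVIDIA hardware
out = torch.empty(32, device=x.device)
col_sum_kernel[(1,)](x, out, BLOCK=32)
```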