@@ -375,52 +375,4 @@ tt.func @test(%arg0: tensor<16x32xf32, #mma>) -> tensor<16xf32, #ttg.slice<{dim
375375 "mlir::triton::gpu::TritonGPUDialect"];
376376}
377377
-def TritonIntelGPUOptimizeElementwiseParallelism
-    : Pass<"tritonintelgpu-optimize-elementwise-parallelism", "mlir::ModuleOp"> {
-  let summary =
-      "Improve parallelism of elementwise operations to better utilize hardware resources.";
-
-  let description = [{
-    Detect elementwise operations whose encoding causes sub-par parallelism,
-    i.e., data duplication across threads, and convert their operands to a
-    more optimal encoding if the cost of doing so is heuristically estimated
-    to be sufficiently low. As of now, that cost must be 0: we only support
-    "unbroadcasting" tensors, i.e., dropping the duplicated values held by
-    other threads and re-distributing the data.
-
-    As an example, this pass would modify the following code:
-    ```mlir
-    #blocked = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-
-    module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
-      tt.func @test_blocked(%arg0: tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> {
-        %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
-        tt.return %0 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
-      }
-    }
-    ```
-    Obtaining:
-    ```mlir
-    #blocked = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-    #blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-
-    module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
-      tt.func @test_blocked(%arg0: tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> {
-        %0 = ttg.convert_layout %arg0 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1>
-        %1 = ttg.convert_layout %arg1 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1>
-        %2 = arith.addf %0, %1 : tensor<16xf32, #blocked1>
-        %3 = ttg.convert_layout %2 : tensor<16xf32, #blocked1> -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
-        tt.return %3 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
-      }
-    }
-    ```
-
-    Note how the converted tensors are not sliced and thus each element in the
-    tensor is held by a single thread.
-  }];
-
-  let dependentDialects = [];
-}
-
-
 #endif // TRITON_INTEL_GPU_PASSES
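
For context, passes declared this way are typically exercised through `triton-opt`, with the command-line flag derived from the mnemonic in the `Pass<>` definition. A minimal sketch of how the removed pass would have been driven on the example IR above; the flag name is inferred from the TableGen definition and no longer exists once this commit lands:

```shell
# Hypothetical invocation: --tritonintelgpu-optimize-elementwise-parallelism
# is the flag generated from the Pass<> mnemonic in the removed definition,
# and test_blocked.mlir stands in for the first MLIR listing above.
triton-opt test_blocked.mlir \
    --tritonintelgpu-optimize-elementwise-parallelism
```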