Skip to content

Commit ba7a1c3

Browse files
authored
[XPU] Drop -tritonintelgpu-optimize-elementwise-parallelism pass (#2877)
Drop this pass, which is no longer needed: the layout it tries to avoid should already have been avoided by previous passes and propagated by layout anchoring and layout conversion elimination. Signed-off-by: victor-eds <[email protected]>
1 parent c42f4a7 commit ba7a1c3

File tree

6 files changed

+0
-395
lines changed

6 files changed

+0
-395
lines changed

test/TritonIntelGPU/optimize-elementwise.mlir

Lines changed: 0 additions & 153 deletions
This file was deleted.

third_party/intel/backend/compiler.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,6 @@ def make_ttgir(mod, metadata, opt, properties):
255255
passes.ttgpuir.add_optimize_dot_operands(pm, True)
256256
if os.getenv("TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY", "0") == "1":
257257
intel.passes.ttgpuir.add_optimize_reduction_locality(pm)
258-
intel.passes.ttgpuir.add_optimize_elementwise_parallelism(pm)
259258
intel.passes.ttgpuir.add_remove_layout_conversions(pm)
260259
intel.passes.ttgpuir.add_reduce_data_duplication(pm)
261260
passes.ttgpuir.add_reorder_instructions(pm)

third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -375,52 +375,4 @@ tt.func @test(%arg0: tensor<16x32xf32, #mma>) -> tensor<16xf32, #ttg.slice<{dim
375375
"mlir::triton::gpu::TritonGPUDialect"];
376376
}
377377

378-
def TritonIntelGPUOptimizeElementwiseParallelism
379-
: Pass<"tritonintelgpu-optimize-elementwise-parallelism", "mlir::ModuleOp"> {
380-
let summary =
381-
"Improve parallelism of elementwise operations better utilizing hardware resources.";
382-
383-
let description = [{
384-
Detect elementwise operations with an encoding causing sub-par parallelism,
385-
i.e., with data duplication across threads, and convert the operands to a
386-
more optimal encoding if the cost of doing so is heuristically estimated to
387-
be sufficiently low. As of now, the cost should be 0, we only support
388-
"unbroadcasting" tensors, i.e., dropping duplicated values held in other
389-
threads by re-distributing them.
390-
391-
As an example, this pass would modify the following code:
392-
```mlir
393-
#blocked = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
394-
395-
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
396-
tt.func @test_blocked(%arg0: tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> {
397-
%0 = arith.addf %arg0, %arg1 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
398-
tt.return %0 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
399-
}
400-
}
401-
```
402-
Obtaining:
403-
```mlir
404-
#blocked = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
405-
#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
406-
407-
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
408-
tt.func @test_blocked(%arg0: tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> {
409-
%0 = ttg.convert_layout %arg0 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1>
410-
%1 = ttg.convert_layout %arg1 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1>
411-
%2 = arith.addf %0, %1 : tensor<16xf32, #blocked1>
412-
%3 = ttg.convert_layout %2 : tensor<16xf32, #blocked1> -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
413-
tt.return %3 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
414-
}
415-
}
416-
```
417-
418-
Note how the converted tensors are not sliced and thus each element in the
419-
tensor is held by a single thread.
420-
}];
421-
422-
let dependentDialects = [];
423-
}
424-
425-
426378
#endif // TRITON_INTEL_GPU_PASSES

third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ add_triton_library(TritonIntelGPUTransforms
44
DistributeToWarps.cpp
55
MatchTargetSize.cpp
66
MaterializeBlockPointer.cpp
7-
OptimizeElementwiseParallelism.cpp
87
OptimizeReductionLocality.cpp
98
Pipeliner/MatmulLoopPipeline.cpp
109
Pipeliner/SoftwarePipeliner.cpp

0 commit comments

Comments
 (0)