@@ -375,52 +375,4 @@ tt.func @test(%arg0: tensor<16x32xf32, #mma>) -> tensor<16xf32, #ttg.slice<{dim
375375 "mlir::triton::gpu::TritonGPUDialect"];
376376}
377377
-def TritonIntelGPUOptimizeElementwiseParallelism
-    : Pass<"tritonintelgpu-optimize-elementwise-parallelism", "mlir::ModuleOp"> {
-  let summary =
-      "Improve parallelism of elementwise operations to better utilize hardware resources.";
-
-  let description = [{
-    Detect elementwise operations whose encoding causes sub-par parallelism,
-    i.e., data duplication across threads, and convert their operands to a
-    more optimal encoding if the cost of doing so is heuristically estimated
-    to be sufficiently low. As of now, that cost must be 0: we only support
-    "unbroadcasting" tensors, i.e., dropping the duplicated values held by
-    other threads and re-distributing the data.
-
-    As an example, this pass would modify the following code:
-    ```mlir
-    #blocked = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-
-    module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
-      tt.func @test_blocked(%arg0: tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> {
-        %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
-        tt.return %0 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
-      }
-    }
-    ```
-    Obtaining:
-    ```mlir
-    #blocked = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-    #blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-
-    module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
-      tt.func @test_blocked(%arg0: tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> {
-        %0 = ttg.convert_layout %arg0 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1>
-        %1 = ttg.convert_layout %arg1 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1>
-        %2 = arith.addf %0, %1 : tensor<16xf32, #blocked1>
-        %3 = ttg.convert_layout %2 : tensor<16xf32, #blocked1> -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
-        tt.return %3 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
-      }
-    }
-    ```
-
-    Note how the converted tensors are not sliced and thus each element in the
-    tensor is held by a single thread.
-  }];
-
-  let dependentDialects = [];
-}
-
-
 #endif // TRITON_INTEL_GPU_PASSES
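
For context, passes declared this way are typically exercised through `triton-opt`, with the command-line flag derived from the mnemonic in the `Pass<>` definition. A minimal sketch of how the removed pass would have been driven on the example IR above; the flag name is inferred from the TableGen definition and no longer exists once this commit lands:

```shell
# Hypothetical invocation: --tritonintelgpu-optimize-elementwise-parallelism
# is the flag generated from the Pass<> mnemonic in the removed definition,
# and test_blocked.mlir stands in for the first MLIR listing above.
triton-opt test_blocked.mlir \
    --tritonintelgpu-optimize-elementwise-parallelism
```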