From fc83953c396d862a6ec95f6a93428bbc47f0a502 Mon Sep 17 00:00:00 2001
From: victor-eds
Date: Thu, 24 Oct 2024 12:29:09 +0100
Subject: [PATCH] [XPU] Conditionally add `-tritonintelgpu-optimize-reduction-locality` to pipeline

Add the `-tritonintelgpu-optimize-reduction-locality` pass to the
pipeline if the `TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY` environment
variable is set to 1.

As shown in #2266, this pass gives quite promising results, although
there is still room for improvement. Conditionally enabling it will
greatly help performance investigation.

Signed-off-by: victor-eds
---
 third_party/intel/backend/compiler.py | 2 ++
 third_party/intel/triton_xpu.cc       | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
index 86948112b9..a13834f991 100644
--- a/third_party/intel/backend/compiler.py
+++ b/third_party/intel/backend/compiler.py
@@ -245,6 +245,8 @@ def make_ttgir(mod, metadata, opt, properties):
         passes.common.add_cse(pm)
         passes.ttgpuir.add_prefetch(pm)
         passes.ttgpuir.add_optimize_dot_operands(pm, True)
+        if os.getenv("TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY", "0") == "1":
+            intel.passes.ttgpuir.add_optimize_reduction_locality(pm)
         intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         intel.passes.ttgpuir.add_reduce_data_duplication(pm)
         passes.ttgpuir.add_reorder_instructions(pm)
diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc
index 201ec17a74..82d405fce0 100644
--- a/third_party/intel/triton_xpu.cc
+++ b/third_party/intel/triton_xpu.cc
@@ -99,6 +99,8 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) {
                      gpu::intel::createTritonIntelGPUReduceDataDuplication);
   ADD_PASS_WRAPPER_0("add_materialize_block_pointer",
                      gpu::intel::createTritonIntelGPUMaterializeBlockPointer);
+  ADD_PASS_WRAPPER_0("add_optimize_reduction_locality",
+                     gpu::intel::createTritonIntelGPUOptimizeReductionLocality);
 }
 
 void init_triton_intel(py::module &&m) {