Skip to content

Commit 123f8f1

Browse files
committed
Integrate
1 parent 5fa5ae6 commit 123f8f1

File tree

3 files changed

+16
-1
lines changed

3 files changed

+16
-1
lines changed

third_party/intel/backend/compiler.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -245,6 +245,7 @@ def make_ttgir(mod, metadata, opt, properties):
245 245
passes.common.add_cse(pm)
246 246
passes.ttgpuir.add_prefetch(pm)
247 247
passes.ttgpuir.add_optimize_dot_operands(pm, True)
248+
intel.passes.ttgpuir.add_optimize_reduction_locality(pm)
248 249
intel.passes.ttgpuir.add_remove_layout_conversions(pm)
249 250
intel.passes.ttgpuir.add_reduce_data_duplication(pm)
250 251
passes.ttgpuir.add_reorder_instructions(pm)

third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -486,8 +486,20 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
486 486
constexpr std::size_t laneIndex = 0;
487 487
constexpr std::size_t registerIndex = 1;
488 488
int32_t size = conversion->getInDimSize(kLane);
489+
std::vector<std::vector<int32_t>> registerBases =
490+
buildBasis(size, registerIndex);
491+
{
492+
// Populate register bases for N > 8.
493+
std::vector<int32_t> base(2);
494+
for (int32_t i = registerBases.back()[registerIndex] * 2,
495+
n = conversion->getInDimSize(kRegister); i < n;
496+
i *= 2) {
497+
base.front() = i;
498+
registerBases.push_back(base);
499+
}
500+
}
489 501
std::array<std::pair<StringAttr, std::vector<std::vector<int32_t>>>, 2>
490-
bases{{{kRegister, buildBasis(size, registerIndex)},
502+
bases{{{kRegister, std::move(registerBases)},
491 503
{kLane, buildBasis(size, laneIndex)}}};
492 504
std::array<StringAttr, 2> outDimNames{kRegister, kLane};
493 505
return conversion == LinearLayout(bases, outDimNames);

third_party/intel/triton_xpu.cc

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -99,6 +99,8 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) {
99 99
gpu::intel::createTritonIntelGPUReduceDataDuplication);
100 100
ADD_PASS_WRAPPER_0("add_materialize_block_pointer",
101 101
gpu::intel::createTritonIntelGPUMaterializeBlockPointer);
102+
ADD_PASS_WRAPPER_0("add_optimize_reduction_locality",
103+
gpu::intel::createTritonIntelGPUOptimizeReductionLocality);
102 104
}
103 105

104 106
void init_triton_intel(py::module &&m) {

0 commit comments

Comments
 (0)