Commit 84ced0e

Revert "Allow Layouts to propogate to local_load" (#5237)
This is causing a performance regression; I'll investigate and reland it. Reverts triton-lang/triton#5219.
1 parent: 340cbc6 · commit: 84ced0e

File tree: 2 files changed, +1 −20 lines

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 1 addition & 2 deletions
@@ -563,8 +563,7 @@ bool canFoldIntoConversion(Operation *op, Attribute targetEncoding) {
   }
   return isa<triton::gpu::ConvertLayoutOp, arith::ConstantOp,
              triton::MakeRangeOp, triton::SplatOp, triton::HistogramOp,
-             triton::gpu::LocalAllocOp, triton::gpu::LocalLoadOp,
-             triton::gpu::LocalStoreOp>(op);
+             triton::gpu::LocalAllocOp, triton::gpu::LocalStoreOp>(op);
 }
 
 scf::ForOp replaceForOpWithNewSignature(
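For context, canFoldIntoConversion is the predicate consulted when eliminating convert_layout ops: if the producer of a conversion's source passes it, the producer can be rebuilt so its result directly carries the conversion's target encoding, and the explicit conversion disappears. This revert takes triton::gpu::LocalLoadOp back out of that list. Below is a minimal sketch of how such a fold might be driven; the helper name tryFoldConversion, the include paths, and the insertion strategy are illustrative assumptions, not this commit's actual call sites.

// A minimal sketch, not this commit's actual pass code. Assumed names:
// tryFoldConversion. canFoldIntoConversion is the function changed above.
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"

using namespace mlir;

static LogicalResult tryFoldConversion(triton::gpu::ConvertLayoutOp cvt,
                                       OpBuilder &builder) {
  Operation *def = cvt.getSrc().getDefiningOp();
  auto dstTy = cast<RankedTensorType>(cvt.getType());
  if (!def || !canFoldIntoConversion(def, dstTy.getEncoding()))
    return failure();
  // Rebuild the producer so its result carries the target layout directly,
  // then forward the clone's result to the conversion's users.
  builder.setInsertionPoint(def);
  Operation *clone = builder.clone(*def);
  clone->getResult(0).setType(dstTy);
  cvt->replaceAllUsesWith(clone->getResults());
  cvt->erase();
  return success();
}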

test/TritonGPU/combine.mlir

Lines changed: 0 additions & 18 deletions
@@ -2685,21 +2685,3 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
     tt.return
   }
 }
-
-// -----
-
-#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [2, 1, 16, 1, 1], warpsPerCTA = [1, 1, 2, 2, 1], order = [4, 0, 1, 2, 3]}>
-#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [1, 1, 32, 1, 1], warpsPerCTA = [1, 1, 1, 1, 4], order = [4, 3, 2, 1, 0]}>
-#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [2, 1, 16, 1, 1], warpsPerCTA = [1, 2, 2, 1, 1], order = [4, 0, 3, 2, 1]}>
-#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [4, 0, 1, 2, 3], hasLeadingOffset = false}>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:100", "triton_gpu.threads-per-warp" = 32 : i32} {
-  // CHECK-LABEL: lift_convert_to_local_load
-  // CHECK-NOT: convert_layout
-  // CHECK: tt.return
-  tt.func public @lift_convert_to_local_load(%arg0 : !triton_gpu.memdesc<2x1x32x4x4xi8, #shared, #triton_gpu.shared_memory, mutable>) -> tensor<2x4x32x1x4xi8, #blocked2> {
-    %1 = triton_gpu.local_load %arg0 : !triton_gpu.memdesc<2x1x32x4x4xi8, #shared, #triton_gpu.shared_memory, mutable> -> tensor<2x1x32x4x4xi8, #blocked>
-    %2 = tt.trans %1 {order = array<i32: 0, 3, 2, 1, 4>} : tensor<2x1x32x4x4xi8, #blocked> -> tensor<2x4x32x1x4xi8, #blocked1>
-    %3 = triton_gpu.convert_layout %2 : tensor<2x4x32x1x4xi8, #blocked1> -> tensor<2x4x32x1x4xi8, #blocked2>
-    tt.return %3 : tensor<2x4x32x1x4xi8, #blocked2>
-  }
-}
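The deleted test exercised exactly that fold: the convert_layout on %2 could be lifted backward through tt.trans and absorbed into the local_load, so the pass left no conversion behind (the CHECK-NOT line). A rough sketch of such a backward search follows; the helper name findFoldableProducer is hypothetical, and the sketch elides the encoding permutation a real implementation must apply when stepping across tt.trans.

// Hypothetical helper, not this commit's code: starting from the source of
// a convert_layout, walk up through simple single-operand producers (such
// as tt.trans) looking for an op that can absorb the target encoding. A
// real implementation must also permute targetEncoding at each tt.trans;
// that bookkeeping is elided here.
#include "mlir/IR/Operation.h"
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"

using namespace mlir;

static Operation *findFoldableProducer(Value src, Attribute targetEncoding) {
  while (Operation *def = src.getDefiningOp()) {
    if (canFoldIntoConversion(def, targetEncoding))
      return def; // before this revert, a local_load could match here
    if (def->getNumOperands() != 1)
      break; // only follow trivial single-operand chains
    src = def->getOperand(0);
  }
  return nullptr;
}

With LocalLoadOp removed from the predicate, the walk stops before the local_load, the conversion survives in the IR, and the test's expectations no longer hold, hence its removal alongside the revert.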
