diff --git a/test/TritonIntelGPU/blockptr_store.mlir b/test/TritonIntelGPU/blockptr_store.mlir index c4e2ef38cf..81397ae757 100644 --- a/test/TritonIntelGPU/blockptr_store.mlir +++ b/test/TritonIntelGPU/blockptr_store.mlir @@ -1,5 +1,4 @@ // RUN: triton-opt %s -split-input-file --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm -// RUN: env TRITON_INTEL_ENABLE_BLOCK_IO_ALL_LAYOUTS=1 triton-opt %s -split-input-file --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=ALL-LAYOUT #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [4, 4], repCluster = [2, 2]}> #dot_a = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}> @@ -11,25 +10,25 @@ module attributes {ttig.support_sg_2d_block, "ttg.num-warps" = 16 : i32, "ttg.t %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 %0 = tt.make_tensor_ptr %arg0, [%c64_i64, %c64_i64], [%c1_i64, %col_stride], [%c0_i32, %c0_i32] {order = array} : > - // ALL-LAYOUT: %[[OFF_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[OFF_1:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[HEIGHT_i64:.*]] = llvm.extractvalue {{.*}}[2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[WIDTH_i64:.*]] = llvm.extractvalue {{.*}}[3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[COL_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[BASE_PTR:.*]] = llvm.extractvalue {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - - // ALL-LAYOUT: %[[HEIGHT:.*]] = llvm.trunc %[[HEIGHT_i64]] : i64 to i32 - - // ALL-LAYOUT: %[[OFFSET:.*]] = llvm.add %[[OFF_0]], {{.*}} : i32 - // ALL-LAYOUT: %[[BASE:.*]] = llvm.getelementptr %[[BASE_PTR]]{{.*}} : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8 - // ALL-LAYOUT: %[[OFFSET_X:.*]] = llvm.mlir.constant(0 : i32) : i32 - // ALL-LAYOUT: %[[OFFSET_Y:.*]] = llvm.select {{.*}}, %[[OFFSET]], %[[HEIGHT]] : i1, i32 - // ALL-LAYOUT: llvm.mlir.undef : vector<4xi8> - // ALL-LAYOUT-COUNT-4: llvm.insertelement %{{[0-9]+}}, %{{[0-9]+}}{{\[}}{{.*}} : i32] : vector<4xi8> - // ALL-LAYOUT: triton_gen.2Dblockstore {{.*}}, %[[OFFSET_X]], %[[OFFSET_Y]], {{.*}} {elem_size_in_bits = 8, tile_width = 8, tile_height = 8, v_blocks = 1, cache_control = Default} + // CHECK: %[[OFF_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[OFF_1:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[HEIGHT_i64:.*]] = llvm.extractvalue {{.*}}[2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[WIDTH_i64:.*]] = llvm.extractvalue {{.*}}[3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[COL_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[BASE_PTR:.*]] = llvm.extractvalue {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + + // CHECK: %[[HEIGHT:.*]] = llvm.trunc %[[HEIGHT_i64]] : i64 to i32 + + // CHECK: %[[OFFSET:.*]] = llvm.add %[[OFF_0]], {{.*}} : i32 + // CHECK: %[[BASE:.*]] = llvm.getelementptr %[[BASE_PTR]]{{.*}} : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8 + // CHECK: %[[OFFSET_X:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[OFFSET_Y:.*]] = llvm.select {{.*}}, %[[OFFSET]], %[[HEIGHT]] : i1, i32 + // CHECK: llvm.mlir.undef : vector<4xi8> + // CHECK-COUNT-4: llvm.insertelement %{{[0-9]+}}, %{{[0-9]+}}{{\[}}{{.*}} : i32] : vector<4xi8> + // CHECK: triton_gen.2Dblockstore {{.*}}, %[[OFFSET_X]], %[[OFFSET_Y]], {{.*}} {elem_size_in_bits = 8, tile_width = 8, tile_height = 8, v_blocks = 1, cache_control = Default} tt.store %0, %cst {ttig.block_io = "row_major", boundaryCheck = array} : !tt.ptr> - // ALL-LAYOUT-COUNT-63: triton_gen.2Dblockstore {{.*}} {elem_size_in_bits = 8, tile_width = 8, tile_height = 8, v_blocks = 1, cache_control = Default} + // CHECK-COUNT-63: triton_gen.2Dblockstore {{.*}} {elem_size_in_bits = 8, tile_width = 8, tile_height = 8, v_blocks = 1, cache_control = Default} tt.return } @@ -47,25 +46,25 @@ module attributes {ttig.support_sg_2d_block, "ttg.num-warps" = 16 : i32, "ttg.t %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 %0 = tt.make_tensor_ptr %arg0, [%c64_i64, %c64_i64], [%c1_i64, %col_stride], [%c0_i32, %c0_i32] {order = array} : > - // ALL-LAYOUT: %[[OFF_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[OFF_1:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[HEIGHT_i64:.*]] = llvm.extractvalue {{.*}}[2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[WIDTH_i64:.*]] = llvm.extractvalue {{.*}}[3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[COL_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[BASE_PTR:.*]] = llvm.extractvalue {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - - // ALL-LAYOUT: %[[HEIGHT:.*]] = llvm.trunc %[[HEIGHT_i64]] : i64 to i32 - - // ALL-LAYOUT: %[[OFFSET:.*]] = llvm.add %[[OFF_0]], {{.*}} : i32 - // ALL-LAYOUT: %[[BASE:.*]] = llvm.getelementptr %[[BASE_PTR]]{{.*}} : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8 - // ALL-LAYOUT: %[[OFFSET_X:.*]] = llvm.mlir.constant(0 : i32) : i32 - // ALL-LAYOUT: %[[OFFSET_Y:.*]] = llvm.select {{.*}}, %[[OFFSET]], %[[HEIGHT]] : i1, i32 - // ALL-LAYOUT: llvm.mlir.undef : vector<8xi8> - // ALL-LAYOUT-COUNT-8: llvm.insertelement %{{[0-9]+}}, %{{[0-9]+}}{{\[}}{{.*}} : i32] : vector<8xi8> - // ALL-LAYOUT: triton_gen.2Dblockstore {{.*}}, %[[OFFSET_X]], %[[OFFSET_Y]], {{.*}} {elem_size_in_bits = 8, tile_width = 16, tile_height = 8, v_blocks = 1, cache_control = Default} + // CHECK: %[[OFF_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[OFF_1:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[HEIGHT_i64:.*]] = llvm.extractvalue {{.*}}[2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[WIDTH_i64:.*]] = llvm.extractvalue {{.*}}[3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[COL_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[BASE_PTR:.*]] = llvm.extractvalue {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + + // CHECK: %[[HEIGHT:.*]] = llvm.trunc %[[HEIGHT_i64]] : i64 to i32 + + // CHECK: %[[OFFSET:.*]] = llvm.add %[[OFF_0]], {{.*}} : i32 + // CHECK: %[[BASE:.*]] = llvm.getelementptr %[[BASE_PTR]]{{.*}} : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8 + // CHECK: %[[OFFSET_X:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[OFFSET_Y:.*]] = llvm.select {{.*}}, %[[OFFSET]], %[[HEIGHT]] : i1, i32 + // CHECK: llvm.mlir.undef : vector<8xi8> + // CHECK-COUNT-8: llvm.insertelement %{{[0-9]+}}, %{{[0-9]+}}{{\[}}{{.*}} : i32] : vector<8xi8> + // CHECK: triton_gen.2Dblockstore {{.*}}, %[[OFFSET_X]], %[[OFFSET_Y]], {{.*}} {elem_size_in_bits = 8, tile_width = 16, tile_height = 8, v_blocks = 1, cache_control = Default} tt.store %0, %cst {ttig.block_io = "row_major", boundaryCheck = array} : !tt.ptr> - // ALL-LAYOUT-COUNT-63: triton_gen.2Dblockstore {{.*}} {elem_size_in_bits = 8, tile_width = 16, tile_height = 8, v_blocks = 1, cache_control = Default} + // CHECK-COUNT-63: triton_gen.2Dblockstore {{.*}} {elem_size_in_bits = 8, tile_width = 16, tile_height = 8, v_blocks = 1, cache_control = Default} tt.return } @@ -83,25 +82,25 @@ module attributes {ttig.support_sg_2d_block, "ttg.num-warps" = 16 : i32, "ttg.t %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 %0 = tt.make_tensor_ptr %arg0, [%c64_i64, %c64_i64], [%c1_i64, %col_stride], [%c0_i32, %c0_i32] {order = array} : > - // ALL-LAYOUT: %[[OFF_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[OFF_1:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[HEIGHT_i64:.*]] = llvm.extractvalue {{.*}}[2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[WIDTH_i64:.*]] = llvm.extractvalue {{.*}}[3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[COL_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[BASE_PTR:.*]] = llvm.extractvalue {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - - // ALL-LAYOUT: %[[HEIGHT:.*]] = llvm.trunc %[[HEIGHT_i64]] : i64 to i32 - - // ALL-LAYOUT: %[[OFFSET:.*]] = llvm.add %[[OFF_0]], {{.*}} : i32 - // ALL-LAYOUT: %[[BASE:.*]] = llvm.getelementptr %[[BASE_PTR]]{{.*}} : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8 - // ALL-LAYOUT: %[[OFFSET_X:.*]] = llvm.mlir.constant(0 : i32) : i32 - // ALL-LAYOUT: %[[OFFSET_Y:.*]] = llvm.select {{.*}}, %[[OFFSET]], %[[HEIGHT]] : i1, i32 - // ALL-LAYOUT: llvm.mlir.undef : vector<16xi8> - // ALL-LAYOUT-COUNT-16: llvm.insertelement %{{[0-9]+}}, %{{[0-9]+}}{{\[}}{{.*}} : i32] : vector<16xi8> - // ALL-LAYOUT: triton_gen.2Dblockstore {{.*}}, %[[OFFSET_X]], %[[OFFSET_Y]], {{.*}} {elem_size_in_bits = 16, tile_width = 32, tile_height = 8, v_blocks = 1, cache_control = Default} + // CHECK: %[[OFF_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[OFF_1:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[HEIGHT_i64:.*]] = llvm.extractvalue {{.*}}[2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[WIDTH_i64:.*]] = llvm.extractvalue {{.*}}[3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[COL_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[BASE_PTR:.*]] = llvm.extractvalue {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + + // CHECK: %[[HEIGHT:.*]] = llvm.trunc %[[HEIGHT_i64]] : i64 to i32 + + // CHECK: %[[OFFSET:.*]] = llvm.add %[[OFF_0]], {{.*}} : i32 + // CHECK: %[[BASE:.*]] = llvm.getelementptr %[[BASE_PTR]]{{.*}} : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8 + // CHECK: %[[OFFSET_X:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[OFFSET_Y:.*]] = llvm.select {{.*}}, %[[OFFSET]], %[[HEIGHT]] : i1, i32 + // CHECK: llvm.mlir.undef : vector<16xi8> + // CHECK-COUNT-16: llvm.insertelement %{{[0-9]+}}, %{{[0-9]+}}{{\[}}{{.*}} : i32] : vector<16xi8> + // CHECK: triton_gen.2Dblockstore {{.*}}, %[[OFFSET_X]], %[[OFFSET_Y]], {{.*}} {elem_size_in_bits = 16, tile_width = 32, tile_height = 8, v_blocks = 1, cache_control = Default} tt.store %0, %cst {ttig.block_io = "row_major", boundaryCheck = array} : !tt.ptr> - // ALL-LAYOUT-COUNT-31: triton_gen.2Dblockstore {{.*}} {elem_size_in_bits = 16, tile_width = 32, tile_height = 8, v_blocks = 1, cache_control = Default} + // CHECK-COUNT-31: triton_gen.2Dblockstore {{.*}} {elem_size_in_bits = 16, tile_width = 32, tile_height = 8, v_blocks = 1, cache_control = Default} tt.return } @@ -118,25 +117,25 @@ module attributes {ttig.support_sg_2d_block, "ttg.num-warps" = 16 : i32, "ttg.t %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 %0 = tt.make_tensor_ptr %arg0, [%c64_i64, %c64_i64], [%c1_i64, %col_stride], [%c0_i32, %c0_i32] {order = array} : > - // ALL-LAYOUT: %[[OFF_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[OFF_1:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[HEIGHT_i64:.*]] = llvm.extractvalue {{.*}}[2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[WIDTH_i64:.*]] = llvm.extractvalue {{.*}}[3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[COL_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // ALL-LAYOUT: %[[BASE_PTR:.*]] = llvm.extractvalue {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - - // ALL-LAYOUT: %[[HEIGHT:.*]] = llvm.trunc %[[HEIGHT_i64]] : i64 to i32 - - // ALL-LAYOUT: %[[OFFSET:.*]] = llvm.add %[[OFF_0]], {{.*}} : i32 - // ALL-LAYOUT: %[[BASE:.*]] = llvm.getelementptr %[[BASE_PTR]]{{.*}} : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8 - // ALL-LAYOUT: %[[OFFSET_X:.*]] = llvm.mlir.constant(0 : i32) : i32 - // ALL-LAYOUT: %[[OFFSET_Y:.*]] = llvm.select {{.*}}, %[[OFFSET]], %[[HEIGHT]] : i1, i32 - // ALL-LAYOUT: llvm.mlir.undef : vector<8xi8> - // ALL-LAYOUT-COUNT-8: llvm.insertelement %{{[0-9]+}}, %{{[0-9]+}}{{\[}}{{.*}} : i32] : vector<8xi8> - // ALL-LAYOUT: triton_gen.2Dblockstore {{.*}}, %[[OFFSET_X]], %[[OFFSET_Y]], {{.*}} {elem_size_in_bits = 16, tile_width = 32, tile_height = 4, v_blocks = 1, cache_control = Default} + // CHECK: %[[OFF_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[OFF_1:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[HEIGHT_i64:.*]] = llvm.extractvalue {{.*}}[2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[WIDTH_i64:.*]] = llvm.extractvalue {{.*}}[3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[COL_STRIDE_i64:.*]] = llvm.extractvalue {{.*}}[5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + // CHECK: %[[BASE_PTR:.*]] = llvm.extractvalue {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> + + // CHECK: %[[HEIGHT:.*]] = llvm.trunc %[[HEIGHT_i64]] : i64 to i32 + + // CHECK: %[[OFFSET:.*]] = llvm.add %[[OFF_0]], {{.*}} : i32 + // CHECK: %[[BASE:.*]] = llvm.getelementptr %[[BASE_PTR]]{{.*}} : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8 + // CHECK: %[[OFFSET_X:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[OFFSET_Y:.*]] = llvm.select {{.*}}, %[[OFFSET]], %[[HEIGHT]] : i1, i32 + // CHECK: llvm.mlir.undef : vector<8xi8> + // CHECK-COUNT-8: llvm.insertelement %{{[0-9]+}}, %{{[0-9]+}}{{\[}}{{.*}} : i32] : vector<8xi8> + // CHECK: triton_gen.2Dblockstore {{.*}}, %[[OFFSET_X]], %[[OFFSET_Y]], {{.*}} {elem_size_in_bits = 16, tile_width = 32, tile_height = 4, v_blocks = 1, cache_control = Default} tt.store %0, %cst {ttig.block_io = "row_major", boundaryCheck = array} : !tt.ptr> - // ALL-LAYOUT-COUNT-7: triton_gen.2Dblockstore {{.*}} {elem_size_in_bits = 16, tile_width = 32, tile_height = 4, v_blocks = 1, cache_control = Default} + // CHECK-COUNT-7: triton_gen.2Dblockstore {{.*}} {elem_size_in_bits = 16, tile_width = 32, tile_height = 4, v_blocks = 1, cache_control = Default} tt.return } diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 4e29a30202..5afe3abf2f 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -2677,9 +2677,11 @@ struct StoreOpToBlockIOConversion LogicalResult matchAndRewrite(triton::StoreOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const final { - static const bool enableBlockIOForAllLayout = - triton::tools::getBoolEnv("TRITON_INTEL_ENABLE_BLOCK_IO_ALL_LAYOUTS"); - if (!isBlockIOCandidate(op, enableBlockIOForAllLayout)) + std::optional enableBlockIOForAllLayout = + mlir::triton::tools::isEnvValueBool(mlir::triton::tools::getStrEnv( + "TRITON_INTEL_ENABLE_BLOCK_IO_ALL_LAYOUTS")); + if (!isBlockIOCandidate(op, !enableBlockIOForAllLayout.has_value() || + enableBlockIOForAllLayout.value())) return failure(); Location loc = op.getLoc();