diff --git a/include/triton/Tools/Sys/GetEnv.hpp b/include/triton/Tools/Sys/GetEnv.hpp index 5efa2002de..c640d50215 100644 --- a/include/triton/Tools/Sys/GetEnv.hpp +++ b/include/triton/Tools/Sys/GetEnv.hpp @@ -44,6 +44,7 @@ inline const std::set CACHE_INVALIDATING_ENV_VARS = { "TRITON_F32_DEFAULT", "TRITON_PREFER_TMEM_16x256_LAYOUT", "TRITON_ENABLE_EXPERIMENTAL_CONSAN", + "TRITON_INTEL_2DBLOCK_ASSERT", "TRITON_INTEL_AGGRESSIVE_DPAS_REUSE", "TRITON_INTEL_ENABLE_BLOCK_IO_ALL_LAYOUTS", "TRITON_INTEL_ENABLE_DPAS_FOR_WARP_SIZE_32", diff --git a/python/test/unit/intel/block_load_helper.py b/python/test/unit/intel/block_load_helper.py new file mode 100644 index 0000000000..22d5ed3ee0 --- /dev/null +++ b/python/test/unit/intel/block_load_helper.py @@ -0,0 +1,55 @@ +import torch +import triton + +import ctypes +import sys + + +def run_load_ir(temp_file, elem_size, *args): + out_type = f"i{int(elem_size) * 4}" + ir = f""" + module attributes {{ + ttg.target = "xpu", + "ttg.num-warps" = 32 : i32, + "ttg.num-ctas" = 1 : i32, + "ttg.threads-per-warp" = 16 : i32 + }} {{ + tt.func @dyn_block( + %iptr : i64, %base_width : i32, + %base_height : i32, %base_pitch : i32, + %x : i32, %y : i32) {{ + %p0 = llvm.inttoptr %iptr : i64 to !llvm.ptr + + %v = triton_gen.2Dblockload %p0, %base_width, %base_height, + %base_pitch, %x, %y + {{ elem_size_in_bits = {elem_size}, tile_width = 8, tile_height = 8, + v_blocks = 1, transpose = false, + vnni_transform = false, cache_control = Default }} + : (!llvm.ptr, i32, i32, i32, i32, i32) + -> vector<1x{out_type}> + + // prevent GluonInline + %v_cast = llvm.bitcast %v : vector<1x{out_type}> to {out_type} + llvm.inline_asm has_side_effects asm_dialect = att + "", "r" %v_cast : ({out_type}) -> () + + tt.return + }} + }} + """ + + with open(temp_file, "w", encoding="utf-8") as f: + f.write(ir) + + kernel = triton.compile(temp_file) + + a = torch.zeros((256, 64), dtype=torch.float32, device="xpu") + + addr = ctypes.c_int64(a.data_ptr()).value + + 
kernel[(1, 1, 1)](addr, *map(int, args), 0) + + +if __name__ == "__main__": + fn = globals()[sys.argv[1]] + fn(*sys.argv[2:]) diff --git a/python/test/unit/intel/test_block_load.py b/python/test/unit/intel/test_block_load.py index 4d4144946b..de74417863 100644 --- a/python/test/unit/intel/test_block_load.py +++ b/python/test/unit/intel/test_block_load.py @@ -1,5 +1,10 @@ import pytest import torch + +import os +import signal +import subprocess +import sys import pathlib from functools import partial @@ -207,3 +212,45 @@ def triton_mm(X, Y, b=None, transpose_x=False, transpose_y=False): result_tor = fn_tor() result_tri = fn_tri() torch.testing.assert_close(result_tri, result_tor, atol=1e-2, rtol=1e-3) + + +@pytest.mark.parametrize("elem_size, width, height, pitch, x", + [[8, 16777216, 64, 16777216, 0], # width <= 24 bits + [8, 32, 64, 128, 0], # width >= 64 + [8, 66, 64, 128, 0], # width % max(4,elemSize) == 0 + [8, 128, 16777216, 128, 0], # height <= 24 bits + [8, 128, 64, 16777216, 0], # pitch <= 24 bits + [8, 128, 64, 32, 0], # pitch >= 64 + [8, 128, 64, 70, 0], # pitch % 16 == 0 + [8, 128, 64, 120, 0], # pitch >= width + [8, 128, 64, 128, 1], # x*elemSize % 4 == 0 (alignment for 8-bit) + [16, 128, 64, 128, 1], # x*elemSize % 4 == 0 (alignment for 16-bit) + ]) +@pytest.mark.skipif(not is_xpu(), reason="Block load tests are specific to the XPU backend") +@pytest.mark.xfail( + not (torch.xpu.get_device_capability()['has_subgroup_2d_block_io'] + and torch.xpu.get_device_capability()['has_subgroup_matrix_multiply_accumulate']), + reason="Block loads and/or DPAS not supported on this architecture", run=False) +def test_block_load_asserts(elem_size, width, height, pitch, x, monkeypatch, tmp_path: pathlib.Path): + monkeypatch.setenv("TRITON_INTEL_2DBLOCK_ASSERT", "1") + + dir_path = os.path.dirname(os.path.realpath(__file__)) + helper_path = os.path.join(dir_path, "block_load_helper.py") + + temp_file = tmp_path / "test_block_load_asserts.ttgir" + + proc = 
subprocess.run( + [ + sys.executable, helper_path, "run_load_ir", + str(temp_file), + str(elem_size), + str(width), + str(height), + str(pitch), + str(x) + ], + capture_output=True, + ) + + rc = proc.returncode + assert rc == -signal.SIGABRT diff --git a/test/Triton/Intel/FuseReshape/fuse-reshape.mlir b/test/Triton/Intel/FuseReshape/fuse-reshape.mlir deleted file mode 100644 index e18aeeba35..0000000000 --- a/test/Triton/Intel/FuseReshape/fuse-reshape.mlir +++ /dev/null @@ -1,196 +0,0 @@ -// RUN: triton-opt %s -split-input-file -triton-intel-fuse-reshape | FileCheck %s - -// COM: tt.load -> tt.reshape -> tt.dot chain, not in a loop. -tt.func public @fuseLoadWithReshape1(%arg0: !tt.ptr>, %arg1: !tt.ptr) { - %c0_i32 = arith.constant 0 : i32 - %c1_i32 = arith.constant 1 : i32 - %c2_i32 = arith.constant 2 : i32 - %c1_i64 = arith.constant 1 : i64 - %c4_i64 = arith.constant 4 : i64 - %c64_i64 = arith.constant 4 : i64 - %c1024_i64 = arith.constant 1024 : i64 - %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32> - %0 = tt.make_tensor_ptr %arg1, [%c1_i64, %c64_i64, %c1024_i64], [%c1024_i64, %c4_i64, %c1_i64], [%c2_i32, %c1_i32, %c0_i32] {order = array} : > - %1 = tt.load %arg0 {boundaryCheck = array} : !tt.ptr> - %3 = tt.load %0 {boundaryCheck = array} : !tt.ptr> - %4 = tt.reshape %3 : tensor<1x32x256xbf16> -> tensor<32x256xbf16> - %5 = tt.dot %1, %4, %cst, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32> - tt.return -} -// CHECK-LABEL: fuseLoadWithReshape1 -// CHECK-NOT: tt.reshape -// CHECK: [[DIV:%.*]] = arith.divui %c1024_i64, %c4_i64 : i64 -// CHECK: [[MUL1:%.*]] = arith.muli %c1_i64, [[DIV]] : i64 -// CHECK: [[ADD1:%.*]] = arith.addi [[MUL1]], %c4_i64_0 : i64 -// CHECK: [[TRUNC:%.*]] = arith.trunci [[DIV]] : i64 to i32 -// CHECK: [[MUL2:%.*]] = arith.muli %c2_i32, [[TRUNC]] : i32 -// CHECK: [[ADD2:%.*]] = arith.addi [[MUL2]], %c1_i32 : i32 -// CHECK: [[PTR:%.*]] = tt.make_tensor_ptr %arg1, [[[ADD1]], %c1024_i64], 
[%c4_i64, %c1_i64], [[[ADD2]], %c0_i32] {order = array} : > -// CHECK: [[LOAD_B:%.*]] = tt.load [[PTR]] {boundaryCheck = array} : !tt.ptr> -// CHECK: tt.dot {{.*}}, [[LOAD_B]], {{.*}}, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32> - -// ----- - -// COM: tt.load -> tt.reshape -> tt.dot chain, in a loop. -// COM: where the 'make_tensor_ptr' result is not loop carried. -tt.func public @fuseLoadWithReshape2(%arg0: !tt.ptr>, %arg1: !tt.ptr) { - %c0_i32 = arith.constant 0 : i32 - %c32_i32 = arith.constant 32 : i32 - %c1024_i32 = arith.constant 1024 : i32 - %c32_i64 = arith.constant 32 : i64 - %c1_i64 = arith.constant 1 : i64 - %c512_i64 = arith.constant 512 : i64 - %c1024_i64 = arith.constant 1024 : i64 - %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32> - %0 = tt.make_tensor_ptr %arg1, [%c512_i64, %c1024_i64, %c32_i64], [%c1024_i64, %c1_i64, %c512_i64], [%c32_i32, %c32_i32, %c0_i32] {order = array} : > - %res:2 = scf.for %arg3 = %c0_i32 to %c1024_i32 step %c32_i32 iter_args(%arg4 = %cst, %arg5 = %c0_i32) -> (tensor<256x256xf32>, i32) : i32 { - %1 = tt.load %arg0 {boundaryCheck = array} : !tt.ptr> - %3 = tt.load %0 {boundaryCheck = array} : !tt.ptr> - %2 = tt.reshape %3 : tensor<1x256x32xbf16> -> tensor<256x32xbf16> - %4 = tt.dot %2, %1, %arg4, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32> - %5 = arith.addi %arg5, %c32_i32 : i32 - scf.yield %4, %5 : tensor<256x256xf32>, i32 - } - tt.return -} -// CHECK-LABEL: fuseLoadWithReshape2 -// CHECK-NOT: tt.reshape -// CHECK: [[DIV:%.*]] = arith.divui %c1024_i64, %c512_i64 : i64 -// CHECK: [[MUL1:%.*]] = arith.muli %c512_i64, [[DIV]] : i64 -// CHECK: [[ADD1:%.*]] = arith.addi [[MUL1]], %c32_i64 : i64 -// CHECK: [[TRUNC:%.*]] = arith.trunci [[DIV]] : i64 to i32 -// CHECK: [[MUL2:%.*]] = arith.muli %c32_i32, [[TRUNC]] : i32 -// CHECK: [[ADD2:%.*]] = arith.addi [[MUL2]], %c0_i32 : i32 -// CHECK: [[PTR:%.*]] = tt.make_tensor_ptr %arg1, 
[%c1024_i64, [[ADD1]]], [%c1_i64, %c512_i64], [%c32_i32, [[ADD2]]] {order = array} : > -// CHECK: scf.for -// CHECK: [[LOAD_A:%.*]] = tt.load [[PTR]] {boundaryCheck = array} : !tt.ptr> -// CHECK: tt.dot [[LOAD_A]], {{.*}}, {{.*}}, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32> - -// ----- - -// COM: tt.load -> tt.reshape -> tt.dot chain, in a loop -// COM: Where the 'make_tensor_ptr' result is loop carried. -tt.func public @fuseLoadWithReshape3(%a_ptr: !tt.ptr {tt.divisibility = 16 : i32}, %b_ptr: !tt.ptr {tt.divisibility = 16 : i32}, %c_ptr: !tt.ptr {tt.divisibility = 16 : i32}, %M: i32 {tt.divisibility = 16 : i32}, %N: i32 {tt.divisibility = 16 : i32}, %K: i32 {tt.divisibility = 16 : i32}, %stride_am: i32 {tt.divisibility = 16 : i32}, %stride_bk: i32 {tt.divisibility = 16 : i32}, %stride_cm: i32 {tt.divisibility = 16 : i32}) { - %c127_i32 = arith.constant 127 : i32 - %c255_i32 = arith.constant 255 : i32 - %cst = arith.constant dense<0.000000e+00> : tensor<256x128xf32> - %c32_i32 = arith.constant 32 : i32 - %c128_i32 = arith.constant 128 : i32 - %c0_i32 = arith.constant 0 : i32 - %c1_i64 = arith.constant 1 : i64 - %c256_i32 = arith.constant 256 : i32 - %c4_i32 = arith.constant 4 : i32 - %0 = tt.get_program_id x : i32 - %1 = arith.addi %M, %c255_i32 : i32 - %2 = arith.divsi %1, %c256_i32 : i32 - %3 = arith.addi %N, %c127_i32 : i32 - %4 = arith.divsi %3, %c128_i32 : i32 - %5 = arith.muli %4, %c4_i32 : i32 - %6 = arith.divsi %0, %5 : i32 - %7 = arith.muli %6, %c4_i32 : i32 - %8 = arith.subi %2, %7 : i32 - %9 = arith.minsi %8, %c4_i32 : i32 - %10 = arith.remsi %0, %5 : i32 - %11 = arith.remsi %10, %9 : i32 - %12 = arith.addi %7, %11 : i32 - %13 = arith.divsi %10, %9 : i32 - %14 = arith.muli %12, %c256_i32 : i32 - %15 = arith.extsi %M : i32 to i64 - %16 = arith.extsi %K : i32 to i64 - %17 = arith.extsi %stride_am : i32 to i64 - %18 = tt.make_tensor_ptr %a_ptr, [%c1_i64, %15, %16], [%c1_i64, %17, %c1_i64], [%c0_i32, %c128_i32, 
%c0_i32] {order = array} : > - %19 = arith.muli %13, %c128_i32 : i32 - %20 = arith.extsi %N : i32 to i64 - %21 = arith.extsi %stride_bk : i32 to i64 - %22 = tt.make_tensor_ptr %b_ptr, [%16, %20], [%21, %c1_i64], [%c0_i32, %19] {order = array} : > - %accumulator:3 = scf.for %k = %c0_i32 to %K step %c32_i32 iter_args(%a_block_ptr = %18, %b_block_ptr = %22, %accumulator_0 = %cst) -> (!tt.ptr>, !tt.ptr>, tensor<256x128xf32>) : i32 { - %25 = tt.load %a_block_ptr {boundaryCheck = array} : !tt.ptr> - %26 = tt.reshape %25 : tensor<1x256x32xf32> -> tensor<256x32xf32> - %27 = tt.load %b_block_ptr {boundaryCheck = array} : !tt.ptr> - %28 = tt.dot %26, %27, %cst, inputPrecision = tf32 : tensor<256x32xf32> * tensor<32x128xf32> -> tensor<256x128xf32> - %29 = arith.addf %accumulator_0, %28 : tensor<256x128xf32> - %30 = tt.advance %a_block_ptr, [%c0_i32, %c0_i32, %c32_i32] : > - %31 = tt.advance %b_block_ptr, [%c32_i32, %c0_i32] : > - scf.yield %30, %31, %29 : !tt.ptr>, !tt.ptr>, tensor<256x128xf32> - } - %23 = arith.extsi %stride_cm : i32 to i64 - %24 = tt.make_tensor_ptr %c_ptr, [%15, %20], [%23, %c1_i64], [%14, %19] {order = array} : > - tt.store %24, %accumulator#2 {boundaryCheck = array} : !tt.ptr> - tt.return -} -// CHECK-LABEL: fuseLoadWithReshape3 -// CHECK-NOT: tt.reshape -// CHECK: [[EXT_M:%.*]] = arith.extsi %arg3 : i32 to i64 -// CHECK: [[DIV:%.*]] = arith.divui %c1_i64, %17 : i64 -// CHECK: [[MUL1:%.*]] = arith.muli %c1_i64, [[DIV]] : i64 -// CHECK: [[ADD1:%.*]] = arith.addi [[MUL1]], %15 : i64 -// CHECK: [[TRUNC:%.*]] = arith.trunci [[DIV]] : i64 to i32 -// CHECK: [[MUL2:%.*]] = arith.muli %c0_i32, [[TRUNC]] : i32 -// CHECK: [[ADD2:%.*]] = arith.addi [[MUL2]], %c128_i32 : i32 -// CHECK: [[PTR:%.*]] = tt.make_tensor_ptr %arg0, [[[ADD1]], %16], [%17, %c1_i64], [[[ADD2]], %c0_i32] {order = array} : > -// CHECK: scf.for {{.*}} = %c0_i32 to {{.*}} step %c32_i32 iter_args([[ARG:%.*]] = [[PTR]] -// CHECK: [[LOAD_A:%.*]] = tt.load [[ARG]] {boundaryCheck = array} : !tt.ptr> 
-// CHECK: tt.dot [[LOAD_A]], {{.*}}, {{.*}}, inputPrecision = tf32 : tensor<256x32xf32> * tensor<32x128xf32> -> tensor<256x128xf32> -// CHECK: tt.advance [[ARG]], [%c0_i32, %c32_i32] : > - -// ----- - -// COM: tt.load -> tt.reshape -> tt.dot chain, in 2 loops. -// COM: Where the block ptr used by the loads in the 2 loops is created by the same make_tensor_ptr operation. -tt.func public @fuseLoadWithReshape4(%arg0: i32, %arg1: !tt.ptr, %arg2: !tt.ptr) { - %c0_i32 = arith.constant 0 : i32 - %c1_i32 = arith.constant 1 : i32 - %c2_i32 = arith.constant 2 : i32 - %c32_i32 = arith.constant 32 : i32 - %c1_i64 = arith.constant 1 : i64 - %c64_i64 = arith.constant 64 : i64 - %c256_i64 = arith.constant 256 : i64 - %cst = arith.constant dense<0.000000e+00> : tensor<64x64xf32> - %7 = tt.make_tensor_ptr %arg1, [%c1_i64, %c64_i64], [%c64_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > - %9 = tt.make_tensor_ptr %arg2, [%c1_i64, %c256_i64, %c64_i64], [%c256_i64, %c64_i64, %c1_i64], [%c0_i32, %c1_i32, %c2_i32] {order = array} : > - %10 = tt.advance %7, [%arg0, %c0_i32] : > - %11 = tt.load %10 {boundaryCheck = array} : !tt.ptr> - %res1:1 = scf.for %arg3 = %c0_i32 to %arg0 step %c32_i32 iter_args(%arg4 = %arg0) -> (i32) : i32 { - %adv = tt.advance %9, [%arg4, %c0_i32] : > - %load = tt.load %adv : !tt.ptr> - %reshape = tt.reshape %load : tensor<1x32x64xf16> -> tensor<32x64xf16> - %dot = tt.dot %11, %reshape, %cst, inputPrecision = tf32 : tensor<64x32xf16> * tensor<32x64xf16> -> tensor<64x64xf32> - %add = arith.addi %arg4, %c32_i32 : i32 - scf.yield %add : i32 - } - %res2:1 = scf.for %arg3 = %c0_i32 to %arg0 step %c32_i32 iter_args(%arg4 = %arg0) -> (i32) : i32 { - %adv = tt.advance %9, [%arg4, %c0_i32] : > - %load = tt.load %adv : !tt.ptr> - %reshape = tt.reshape %load : tensor<1x32x64xf16> -> tensor<32x64xf16> - %dot = tt.dot %11, %reshape, %cst, inputPrecision = tf32 : tensor<64x32xf16> * tensor<32x64xf16> -> tensor<64x64xf32> - %add = arith.addi %arg4, %c32_i32 : i32 - 
scf.yield %add : i32 - } - tt.return -} -// CHECK-LABEL: fuseLoadWithReshape4 -// CHECK-NOT: tt.reshape -// CHECK: [[DIV1:%.*]] = arith.divui %c256_i64, %c64_i64 : i64 -// CHECK: [[MUL11:%.*]] = arith.muli %c1_i64, [[DIV1]] : i64 -// CHECK: [[ADD11:%.*]] = arith.addi [[MUL11]], %c256_i64 : i64 -// CHECK: [[TRUNC1:%.*]] = arith.trunci [[DIV1]] : i64 to i32 -// CHECK: [[MUL21:%.*]] = arith.muli %c0_i32, [[TRUNC1]] : i32 -// CHECK: [[ADD21:%.*]] = arith.addi [[MUL21]], %c1_i32 : i32 -// CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr %arg2, [[[ADD11]], %c64_i64], [%c64_i64, %c1_i64], [[[ADD21]], %c2_i32] {order = array} : > -// CHECK: [[DIV2:%.*]] = arith.divui %c256_i64, %c64_i64 : i64 -// CHECK: [[MUL12:%.*]] = arith.muli %c1_i64, [[DIV2]] : i64 -// CHECK: [[ADD12:%.*]] = arith.addi [[MUL12]], %c256_i64 : i64 -// CHECK: [[TRUNC2:%.*]] = arith.trunci [[DIV2]] : i64 to i32 -// CHECK: [[MUL22:%.*]] = arith.muli %c0_i32, [[TRUNC2]] : i32 -// CHECK: [[ADD22:%.*]] = arith.addi [[MUL22]], %c1_i32 : i32 -// CHECK: [[PTR2:%.*]] = tt.make_tensor_ptr %arg2, [[[ADD12]], %c64_i64], [%c64_i64, %c1_i64], [[[ADD22]], %c2_i32] {order = array} : > -// CHECK: scf.for -// CHECK: [[ADV:%.*]] = tt.advance [[PTR2]], {{.*}} : > -// CHECK: [[LOAD_B1:%.*]] = tt.load [[ADV]] : !tt.ptr> -// CHECK: tt.dot {{.*}}, [[LOAD_B1]], {{.*}}, inputPrecision = tf32 : tensor<64x32xf16> * tensor<32x64xf16> -> tensor<64x64xf32> -// CHECK: scf.yield -// CHECK: scf.for -// CHECK: [[ADV:%.*]] = tt.advance [[PTR1]], {{.*}} : > -// CHECK: [[LOAD_B1:%.*]] = tt.load [[ADV]] : !tt.ptr> -// CHECK: tt.dot {{.*}}, [[LOAD_B1]], {{.*}}, inputPrecision = tf32 : tensor<64x32xf16> * tensor<32x64xf16> -> tensor<64x64xf32> -// CHECK: scf.yield diff --git a/test/TritonGEN/tritongen-2Dblockload-to-llvm.mlir b/test/TritonGEN/tritongen-2Dblockload-to-llvm.mlir deleted file mode 100644 index ab7df12178..0000000000 --- a/test/TritonGEN/tritongen-2Dblockload-to-llvm.mlir +++ /dev/null @@ -1,866 +0,0 @@ -// RUN: triton-opt 
-convert-tritongen-to-llvm -split-input-file %s | FileCheck %s - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v2i16({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=8, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<2xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ONE0:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[PTR:%.*]] = llvm.ptrtoint %arg0 : !llvm.ptr<1> to i64 - // CHECK: [[VAL_63:%.*]] = llvm.mlir.constant(-64 : i64) : i64 - // CHECK: [[VAL_64:%.*]] = llvm.and [[PTR]], [[VAL_63]] : i64 - // CHECK: [[VAL_65:%.*]] = llvm.inttoptr [[VAL_64]] : i64 to !llvm.ptr<1> - // CHECK: [[CL:%.*]] = llvm.mlir.constant(63 : i64) : i64 - // CHECK: [[AND:%.*]] = llvm.and [[PTR]], [[CL]] : i64 - // CHECK: [[TRUNC:%.*]] = llvm.trunc [[AND]] : i64 to i32 - // CHECK: [[ADD:%.*]] = llvm.add %arg1, [[TRUNC]] : i32 - // CHECK: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: 
[[SHR:%.*]] = llvm.udiv [[TRUNC]], [[ONE]] : i32 - // CHECK: [[X:%.*]] = llvm.add %arg4, [[SHR]] : i32 - // CHECK: [[BASE_ALIGNED:%.*]] = llvm.ptrtoint [[VAL_65]] : !llvm.ptr<1> to i64 - // CHECK: [[BASEWIDTH:%.*]] = llvm.sub [[ADD]], [[ONE0]] : i32 - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v4i16([[BASE_ALIGNED]], [[BASEWIDTH]], {{.*}}, [[X]], {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=8, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<4xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v8i16({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - %0 = triton_gen.2Dblockload %ptr, 
%base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=8, tile_height=8, v_blocks=4, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v8i8({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi8> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc 
@llvm.genx.GenISA.LSC2DBlockRead.v16i8({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi8> - llvm.return -} -} - -// ----- - -// CHECK: llvm.func spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv(i32, i32, i32, i32, !llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return} - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C32]] x i8 : (i32) -> !llvm.ptr - // CHECK-NEXT: [[PTRTOINT:%.*]] = llvm.ptrtoint %arg0 : !llvm.ptr<1> to i64 - // CHECK-NEXT: [[VAL_63:%.*]] = llvm.mlir.constant(-64 : i64) : i64 - // CHECK-NEXT: [[VAL_64:%.*]] = llvm.and [[PTRTOINT]], [[VAL_63]] : i64 - // CHECK-NEXT: [[BASE_ALIGNED:%.*]] = llvm.inttoptr [[VAL_64]] : i64 to !llvm.ptr<1> - // CHECK-NEXT: [[CL:%.*]] = llvm.mlir.constant(63 : i64) : i64 - // CHECK-NEXT: [[AND:%.*]] = llvm.and [[PTRTOINT]], [[CL]] : i64 - // CHECK-NEXT: [[TRUNC:%.*]] = llvm.trunc [[AND]] : i64 to i32 - // CHECK-NEXT: [[ADD_0:%.*]] = llvm.add %arg1, [[TRUNC]] : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[DIV:%.*]] = llvm.udiv [[TRUNC]], [[ONE]] : i32 - // CHECK-NEXT: [[ADD_1:%.*]] = llvm.add %arg4, [[DIV]] : i32 - // CHECK-DAG: [[ZERO_1:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: 
[[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> - // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement [[ADD_1]], [[UNDEF]][[[ZERO_1]] : i32] : vector<2xi32> - // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> - // CHECK-DAG: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-DAG: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: [[VBlocks:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], [[BASE_ALIGNED]], [[ADD_0]], %arg2, %arg3, [[COORD1]], [[DEST]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<32xi8> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=8, v_blocks=4, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<32xi8> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], 
[[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<8xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ELEM_BITS:%.*]] = 
llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v32i8({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=8, tile_height=16, v_blocks=4, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<32xi8> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=16, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, 
i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<32xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=16, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<32xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z41__spirv_Subgroup2DBlockLoadTransformINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, 
[[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<4xi32> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=8, tile_height=32, v_blocks=1, transpose=false, vnni_transform=true, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<4xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z41__spirv_Subgroup2DBlockLoadTransformINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<8xi32> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=32, v_blocks=1, transpose=false, vnni_transform=true, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 
- // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z41__spirv_Subgroup2DBlockLoadTransformINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=32, v_blocks=2, transpose=false, vnni_transform=true, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z41__spirv_Subgroup2DBlockLoadTransformINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<32xi32> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=32, v_blocks=4, transpose=false, vnni_transform=true, 
cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<32xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<32xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=32, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<32xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], 
{{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<64xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=32, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<64xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v32i16({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=8, tile_height=16, v_blocks=4, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<32xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // 
CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<8xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=16, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} -} - -// ----- - 
-module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<32xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=32, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<32xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, 
!llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=32, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<4xi32> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<4xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] 
= llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<8xi32> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=16, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=16, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : 
!llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=16, tile_height=16, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: lvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32> - %0 = 
triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=32, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<1xi32> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=2, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<1xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 
- // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<32xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=16, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<32xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, 
%base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<64xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=32, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<64xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, 
tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=16, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, 
%arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=32, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<32xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z41__spirv_Subgroup2DBlockLoadTransformINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<8xi32> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=16, v_blocks=1, transpose=false, vnni_transform=true, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : 
i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z41__spirv_Subgroup2DBlockLoadTransformINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=32, v_blocks=1, transpose=false, vnni_transform=true, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z41__spirv_Subgroup2DBlockLoadTransformINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=16, v_blocks=2, transpose=false, vnni_transform=true, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> 
vector<16xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z41__spirv_Subgroup2DBlockLoadTransformINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<32xi32> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=32, v_blocks=2, transpose=false, vnni_transform=true, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<32xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(true) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v1i32({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], 
[[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=2, tile_height=8, v_blocks=1, transpose=true, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<1xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(true) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v2i32({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=4, tile_height=8, v_blocks=1, transpose=true, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<2xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(true) : i1 - // CHECK: [[VNNI:%.*]] = 
llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v4i32({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=8, v_blocks=1, transpose=true, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<4xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z41__spirv_Subgroup2DBlockLoadTransposeINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=16, v_blocks=1, transpose=true, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: 
[[TileWidth:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=64, tile_width=4, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<4xi32> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: %[[ELEM_BITS:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[TILE_WIDTH:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: %[[TILE_HEIGHT:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: %[[VBLOCKS:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: %[[TRANSPOSE:.*]] = llvm.mlir.constant(false) : i1 - // CHECK: %[[VNNI:.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v16i16({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[ELEM_BITS]], %[[TILE_WIDTH]], %[[TILE_HEIGHT]], %[[VBLOCKS]], %[[TRANSPOSE]], %[[VNNI]], {{.*}}) - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=8, tile_height=8, v_blocks=4, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func 
@triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: %[[ELEM_BITS:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[TILE_WIDTH:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: %[[TILE_HEIGHT:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: %[[VBLOCKS:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[TRANSPOSE:.*]] = llvm.mlir.constant(false) : i1 - // CHECK: %[[VNNI:.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v4i16({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[ELEM_BITS]], %[[TILE_WIDTH]], %[[TILE_HEIGHT]], %[[VBLOCKS]], %[[TRANSPOSE]], %[[VNNI]], {{.*}}) - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=8, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<4xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: %[[ELEM_BITS:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: %[[TILE_WIDTH:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: %[[TILE_HEIGHT:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: %[[VBLOCKS:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: %[[TRANSPOSE:.*]] = llvm.mlir.constant(false) : i1 - // CHECK: %[[VNNI:.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v8i16({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[ELEM_BITS]], %[[TILE_WIDTH]], %[[TILE_HEIGHT]], %[[VBLOCKS]], %[[TRANSPOSE]], %[[VNNI]], {{.*}}) - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=8, tile_height=8, v_blocks=2, transpose=false, 
vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv( - // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, Uncached, 4>, #triton_gen.load_cache_control<1, Uncached, 4>> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1UC_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv( - // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, Uncached, 4>, #triton_gen.load_cache_control<1, Cached, 4>> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1UC_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv( - // 
CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, Cached, 4>, #triton_gen.load_cache_control<1, Uncached, 4>> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1C_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv( - // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, Cached, 4>, #triton_gen.load_cache_control<1, Cached, 4>> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1C_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv( - // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, Streaming, 4>, #triton_gen.load_cache_control<1, Uncached, 4>> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1S_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> 
vector<16xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv( - // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, Streaming, 4>, #triton_gen.load_cache_control<1, Cached, 4>> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1S_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv( - // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, InvalidateAfterRead, 4>, #triton_gen.load_cache_control<1, Cached, 4>> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1IAR_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv( - // CHECK-NOT: triton_gen.DecorationCacheControlINTEL - %0 = triton_gen.2Dblockload %ptr, 
%base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> () - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=64, tile_width=8, tile_height=4, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<2xi64> - llvm.return -} -} diff --git a/test/TritonGEN/tritongen-2Dblockprefetch-to-llvm.mlir b/test/TritonGEN/tritongen-2Dblockprefetch-to-llvm.mlir deleted file mode 100644 index 45a357abd6..0000000000 --- a/test/TritonGEN/tritongen-2Dblockprefetch-to-llvm.mlir +++ /dev/null @@ -1,401 +0,0 @@ -// RUN: triton-opt -convert-tritongen-to-llvm -split-input-file %s | FileCheck %s - -// CHECK: llvm.func spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i(i32, i32, i32, i32, !llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} - -llvm.func 
@triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.func @triton_gen.2Dblockprefetch(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK-NEXT: [[PTRTOINT:%.*]] = llvm.ptrtoint %arg0 : !llvm.ptr<1> to i64 - // CHECK-NEXT: [[VAL_63:%.*]] = llvm.mlir.constant(-64 : i64) : i64 - // CHECK-NEXT: [[VAL_64:%.*]] = llvm.and [[PTRTOINT]], [[VAL_63]] : i64 - // CHECK-NEXT: [[BASE_ALIGNED:%.*]] = llvm.inttoptr [[VAL_64]] : i64 to !llvm.ptr<1> - // CHECK-NEXT: [[CL:%.*]] = llvm.mlir.constant(63 : i64) : i64 - // CHECK-NEXT: [[AND:%.*]] = llvm.and [[PTRTOINT]], [[CL]] : i64 - // CHECK-NEXT: [[TRUNC:%.*]] = llvm.trunc [[AND]] : i64 to i32 - // CHECK-NEXT: [[ADD_0:%.*]] = llvm.add %arg1, [[TRUNC]] : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[DIV:%.*]] = llvm.udiv [[TRUNC]], [[ONE]] : i32 - // CHECK-NEXT: [[ADD_1:%.*]] = llvm.add %arg4, [[DIV]] : i32 - // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> - // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement [[ADD_1]], [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> - // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], [[BASE_ALIGNED]], [[ADD_0]], %arg2, %arg3, [[COORD1]]) - // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = 
#triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, Uncached, 4>, #triton_gen.load_cache_control<1, Uncached, 4>> - // CHECK-SAME: : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=1, cache_control=L1UC_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=16, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc 
@_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=32, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ONE0:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[PTR:%.*]] = llvm.ptrtoint %arg0 : !llvm.ptr<1> to i64 - // CHECK: [[VAL_63:%.*]] = llvm.mlir.constant(-64 : i64) : i64 - // CHECK: [[VAL_64:%.*]] = llvm.and [[PTR]], [[VAL_63]] : i64 - // CHECK: [[VAL_65:%.*]] = llvm.inttoptr [[VAL_64]] : i64 to !llvm.ptr<1> - // CHECK: [[CL:%.*]] = llvm.mlir.constant(63 : i64) : i64 - // CHECK: [[AND:%.*]] = llvm.and [[PTR]], [[CL]] : i64 - // CHECK: [[TRUNC:%.*]] = llvm.trunc [[AND]] : i64 to i32 - // CHECK: [[ADD:%.*]] = llvm.add %arg1, [[TRUNC]] : i32 - // CHECK: [[TWO:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[SHR:%.*]] = llvm.udiv [[TRUNC]], [[TWO]] : i32 - // CHECK: [[X:%.*]] = llvm.add %arg4, [[SHR]] : i32 - // CHECK: [[BASE_ALIGNED:%.*]] = llvm.ptrtoint [[VAL_65]] : !llvm.ptr<1> to i64 - // CHECK: [[BASEWIDTH:%.*]] = llvm.sub [[ADD]], [[ONE0]] : i32 - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockPrefetch.isVoid([[BASE_ALIGNED]], 
[[BASEWIDTH]], {{.*}}, [[X]], {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=8, tile_height=1, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockPrefetch.isVoid({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=8, tile_height=2, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockPrefetch.isVoid({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, 
[[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=8, tile_height=4, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockPrefetch.isVoid({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=8, tile_height=16, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, 
i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=16, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, 
i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=32, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - 
triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=16, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=16, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, 
%base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=16, tile_height=16, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=32, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, 
%base_pitch, %x, %y {elem_size_in_bits=32, tile_width=16, tile_height=32, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y 
{elem_size_in_bits=8, tile_width=32, tile_height=16, v_blocks=2, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=32, v_blocks=2, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, 
tile_width=16, tile_height=8, v_blocks=2, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=16, v_blocks=2, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=32, 
v_blocks=2, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=8, v_blocks=2, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=16, v_blocks=2, cache_control=Default} : 
(!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=32, v_blocks=2, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=32, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, 
i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=32, v_blocks=2, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z36__spirv_Subgroup2DBlockPrefetchINTELiiiiPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=32, v_blocks=4, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - 
llvm.return -} diff --git a/test/TritonGEN/tritongen-2Dblockstore-to-llvm.mlir b/test/TritonGEN/tritongen-2Dblockstore-to-llvm.mlir deleted file mode 100644 index ab692e4625..0000000000 --- a/test/TritonGEN/tritongen-2Dblockstore-to-llvm.mlir +++ /dev/null @@ -1,210 +0,0 @@ -// RUN: triton-opt -convert-tritongen-to-llvm -split-input-file %s | FileCheck %s - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi16>) { - // CHECK: [[ONE0:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[PTR:%.*]] = llvm.ptrtoint %arg0 : !llvm.ptr<1> to i64 - // CHECK: [[VAL_63:%.*]] = llvm.mlir.constant(-64 : i64) : i64 - // CHECK: [[VAL_64:%.*]] = llvm.and [[PTR]], [[VAL_63]] : i64 - // CHECK: [[VAL_65:%.*]] = llvm.inttoptr [[VAL_64]] : i64 to !llvm.ptr<1> - // CHECK: [[CL:%.*]] = llvm.mlir.constant(63 : i64) : i64 - // CHECK: [[AND:%.*]] = llvm.and [[PTR]], [[CL]] : i64 - // CHECK: [[TRUNC:%.*]] = llvm.trunc [[AND]] : i64 to i32 - // CHECK: [[ADD:%.*]] = llvm.add %arg1, [[TRUNC]] : i32 - // CHECK: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[SHR:%.*]] = llvm.udiv [[TRUNC]], [[ONE]] : i32 - // CHECK: [[X:%.*]] = llvm.add %arg4, [[SHR]] : i32 - // CHECK: [[BASE_ALIGNED:%.*]] = llvm.ptrtoint [[VAL_65]] : !llvm.ptr<1> to i64 - // CHECK: [[BASEWIDTH:%.*]] = llvm.sub [[ADD]], [[ONE0]] : i32 - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockWrite.v8i16([[BASE_ALIGNED]], [[BASEWIDTH]], {{.*}}, [[X]], {{.*}}, 
[[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=8, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi16>) - llvm.return -} -} - -// ----- - -// CHECK: llvm.func spir_funccc @_Z33__spirv_Subgroup2DBlockStoreINTELiiiiPvPU3AS1viiiDv2_i(i32, i32, i32, i32, !llvm.ptr {llvm.nonnull, llvm.readonly}, !llvm.ptr<1> {llvm.nonnull, llvm.writeonly}, i32, i32, i32, vector<2xi32>) attributes {no_unwind, will_return} - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi8>) { - // CHECK: llvm.func @triton_gen.2Dblockstore(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: vector<8xi8>) { - // CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-NEXT: [[STOREVALPTR:%.*]] = llvm.alloca [[C8]] x i8 : (i32) -> !llvm.ptr - // CHECK-NEXT: llvm.store %arg6, [[STOREVALPTR]] : vector<8xi8>, !llvm.ptr - // CHECK-NEXT: [[PTRTOINT:%.*]] = llvm.ptrtoint %arg0 : !llvm.ptr<1> to i64 - // CHECK-NEXT: [[VAL_63:%.*]] = llvm.mlir.constant(-64 : i64) : i64 - // CHECK-NEXT: [[VAL_64:%.*]] = llvm.and [[PTRTOINT]], [[VAL_63]] : i64 - // CHECK-NEXT: [[BASE_ALIGNED:%.*]] = llvm.inttoptr [[VAL_64]] : i64 to !llvm.ptr<1> - // CHECK-NEXT: [[CL:%.*]] = llvm.mlir.constant(63 : i64) : i64 - // CHECK-NEXT: [[AND:%.*]] = llvm.and [[PTRTOINT]], [[CL]] : i64 - // CHECK-NEXT: [[TRUNC:%.*]] = llvm.trunc [[AND]] : i64 to i32 - // CHECK-NEXT: [[ADD_0:%.*]] = llvm.add %arg1, [[TRUNC]] : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: [[DIV:%.*]] = llvm.udiv [[TRUNC]], [[ONE]] : i32 - // CHECK-NEXT: [[ADD_1:%.*]] = llvm.add %arg4, [[DIV]] : i32 - // 
CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> - // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement [[ADD_1]], [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> - // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> - // CHECK-DAG: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-DAG: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z33__spirv_Subgroup2DBlockStoreINTELiiiiPvPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], [[STOREVALPTR]], [[BASE_ALIGNED]], [[ADD_0]], %arg2, %arg3, [[COORD1]]) - // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.store_cache_control<0, Uncached, 5>, #triton_gen.store_cache_control<1, Uncached, 5>> - // CHECK-SAME: : (i32, i32, i32, i32, !llvm.ptr{{.*}}, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=16, tile_height=8, v_blocks=1, cache_control=L1UC_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi8>) - llvm.return -} -} -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi16>) { - // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[TileWidth:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-DAG: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: [[VBlocks:%.*]] = 
llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z33__spirv_Subgroup2DBlockStoreINTELiiiiPvPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], [[DEST:%.*]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr{{.*}}, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi16>) - llvm.return -} -} - -// ----- - -llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi16>) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockWrite.v8i16({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=16, tile_width=32, tile_height=1, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi16>) - llvm.return -} - -// ----- - -// COM: threads-per-warp = 32 case. 
-llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<1xi16>) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockWrite.v1i16({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=16, tile_width=16, tile_height=2, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<1xi16>) - llvm.return -} -// ----- - -llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi16>) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockWrite.v8i16({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=16, tile_width=8, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, 
i32, i32, vector<8xi16>) - llvm.return -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi16>) { - // CHECK: llvm.mlir.constant(2 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-DAG: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-DAG: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z33__spirv_Subgroup2DBlockStoreINTELiiiiPvPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], [[DEST:%.*]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr{{.*}}, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=16, tile_width=16, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi16>) - llvm.return -} -} - -// ----- - -llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi16>) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockWrite.v8i16({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - triton_gen.2Dblockstore %ptr, %base_width, %base_height, 
%base_pitch, %x, %y, %stored_val {elem_size_in_bits=16, tile_width=32, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi16>) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi16>) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockWrite.v8i16({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=32, tile_width=4, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi16>) - llvm.return -} - -// ----- - -llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi16>) { - // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockWrite.v8i16({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) - 
triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=32, tile_width=8, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi16>) - llvm.return -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi32>) { - // CHECK: llvm.mlir.constant(4 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-DAG: [[TileWidth:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-DAG: [[TileHeight:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc @_Z33__spirv_Subgroup2DBlockStoreINTELiiiiPvPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], [[DEST:%.*]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr{{.*}}, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=32, tile_width=16, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>) - llvm.return -} -} - -// ----- - -module attributes {"ttg.threads-per-warp" = 16 : i32} { -llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<2xi64>) { - // CHECK: llvm.mlir.constant(0 : i32) : i32 - // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: [[TileHeight:%.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-DAG: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NEXT: llvm.call spir_funccc 
@_Z33__spirv_Subgroup2DBlockStoreINTELiiiiPvPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], [[DEST:%.*]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr{{.*}}, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits = 64, tile_width = 8, tile_height = 4, v_blocks = 1, cache_control = Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<2xi64>) - llvm.return -} -} diff --git a/third_party/intel/include/TritonGENToLLVM/TritonGENToLLVMPass.h b/third_party/intel/include/TritonGENToLLVM/TritonGENToLLVMPass.h index 6b06feff40..065d9814b1 100644 --- a/third_party/intel/include/TritonGENToLLVM/TritonGENToLLVMPass.h +++ b/third_party/intel/include/TritonGENToLLVM/TritonGENToLLVMPass.h @@ -21,8 +21,13 @@ namespace triton { #define GEN_PASS_DECL #include "intel/include/TritonGENToLLVM/Passes.h.inc" -void populateTritonGENToLLVMConversionPatterns(LLVMTypeConverter &converter, - RewritePatternSet &patterns); +namespace gpu::intel { +class LibCallEmitter; +} // namespace gpu::intel + +void populateTritonGENToLLVMConversionPatterns( + LLVMTypeConverter &converter, RewritePatternSet &patterns, + const mlir::triton::gpu::intel::LibCallEmitter &emitter); void registerConvertTritonGENToLLVMInterface(DialectRegistry ®istry); diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp index cf25e8ba28..1313eb190c 100644 --- a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp +++ b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp @@ -8,6 +8,7 @@ #include "Attributes.h" #include "Utils/LLVMIntr.h" +#include "Utils/LibCallEmitter.h" #include "Utils/Mangling.h" #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" @@ -35,6 +36,7 @@ #include 
"llvm/Support/ErrorHandling.h" #include "triton/Conversion/TritonGPUToLLVM/Utility.h" +#include "triton/Tools/Sys/GetEnv.hpp" #include "intel/include/Dialect/TritonGEN/IR/TritonGENDialect.h" #include "intel/include/TritonGENToLLVM/TritonGENToLLVMPass.h" @@ -508,6 +510,131 @@ createGenISA2DBlockPrefetch(TritonGEN::Matrix2DBlockPrefetchOp op, intel::noUnwindWillReturnAttrs); } +static void +createAssertNot(ConversionPatternRewriter &rewriter, + const mlir::triton::gpu::intel::LibCallEmitter &emitter, + Value condition, StringRef message) { + + auto *ctx = rewriter.getContext(); + auto loc = rewriter.getInsertionPoint() != rewriter.getBlock()->end() + ? rewriter.getInsertionPoint()->getLoc() + : UnknownLoc::get(ctx); + + StringRef file = "unknown"; + StringRef func = "unknown"; + int line = 0; + + while (auto callLoc = dyn_cast(loc)) + loc = callLoc.getCallee(); + + while (auto nameLoc = dyn_cast(loc)) + loc = nameLoc.getChildLoc(); + + if (auto fileLineColLoc = dyn_cast(loc)) { + file = fileLineColLoc.getFilename(); + line = fileLineColLoc.getLine(); + } + + Block *prevBlock = rewriter.getBlock(); + auto insertPt = rewriter.getInsertionPoint(); + + Block *thenBlock = rewriter.splitBlock(prevBlock, insertPt); + + Block *ifBlock = rewriter.createBlock(prevBlock->getParent()); + rewriter.setInsertionPointToStart(ifBlock); + emitter.assertFail(rewriter, loc, message, file, func, line); + rewriter.create(loc, thenBlock); + + rewriter.setInsertionPointToEnd(prevBlock); + rewriter.create(loc, condition, ifBlock, thenBlock); + + rewriter.setInsertionPointToStart(thenBlock); +} + +static void create2DBlockAssertsImpl( + const mlir::Value &baseWidth, const mlir::Value &baseHeight, + const mlir::Value &basePitch, const mlir::Value &x, unsigned int elemSize, + const mlir::Location &loc, mlir::ConversionPatternRewriter &rewriter, + const mlir::triton::gpu::intel::LibCallEmitter &emitter) { + using namespace mlir; + using namespace mlir::LLVM; + + auto b = TritonLLVMOpBuilder(loc, 
rewriter); + + Value c0 = b.i32_val(0); + Value c4 = b.i32_val(4); + Value c64 = b.i32_val(64); + Value c16 = b.i32_val(16); + Value c24m1 = b.i32_val((1u << 24) - 1); + Value cElemSize = b.i32_val(elemSize); + Value cMaxAlign = b.i32_val(std::max(4u, elemSize)); + + Value wTooLarge = + rewriter.create(loc, ICmpPredicate::ugt, baseWidth, c24m1); + createAssertNot(rewriter, emitter, wTooLarge, + "2nd operand (base width) should be <= 24 bits"); + + Value wTooSmall = + rewriter.create(loc, ICmpPredicate::ult, baseWidth, c64); + createAssertNot(rewriter, emitter, wTooSmall, + "2nd operand (base width) should be >= 64"); + + Value wRem = rewriter.create(loc, baseWidth, cMaxAlign); + Value wNotAligned = rewriter.create(loc, ICmpPredicate::ne, wRem, c0); + createAssertNot( + rewriter, emitter, wNotAligned, + "2nd operand (base width) should be aligned to MAX(4, element_size)"); + + Value hTooLarge = + rewriter.create(loc, ICmpPredicate::ugt, baseHeight, c24m1); + createAssertNot(rewriter, emitter, hTooLarge, + "3rd operand (base height) should be <= 24 bits"); + + Value pTooLarge = + rewriter.create(loc, ICmpPredicate::ugt, basePitch, c24m1); + createAssertNot(rewriter, emitter, pTooLarge, + "4th operand (base pitch) should be <= 24 bits"); + + Value pTooSmall = + rewriter.create(loc, ICmpPredicate::ult, basePitch, c64); + createAssertNot(rewriter, emitter, pTooSmall, + "4th operand (base pitch) should be >= 64"); + + Value pRem = rewriter.create(loc, basePitch, c16); + Value pNotAligned = rewriter.create(loc, ICmpPredicate::ne, pRem, c0); + createAssertNot(rewriter, emitter, pNotAligned, + "4th operand (base pitch) should be a multiple of 16 bytes"); + + Value pLessThanWidth = + rewriter.create(loc, ICmpPredicate::ult, basePitch, baseWidth); + createAssertNot( + rewriter, emitter, pLessThanWidth, + "4th operand (base pitch) should be >= 2nd operand (base width)"); + + Value offsetBytes = rewriter.create(loc, x, cElemSize); + Value offsetRem = rewriter.create(loc, 
offsetBytes, c4); + Value badOffset = + rewriter.create(loc, ICmpPredicate::ne, offsetRem, c0); + createAssertNot( + rewriter, emitter, badOffset, + "5th operand (x) should be properly aligned for the element size"); +} + +template +static void +create2DBlockAsserts(OpTy op, mlir::ConversionPatternRewriter &rewriter, + const mlir::triton::gpu::intel::LibCallEmitter &emitter) { + + if (!triton::tools::getBoolEnv("TRITON_INTEL_2DBLOCK_ASSERT")) { + return; // checks are opt-in via TRITON_INTEL_2DBLOCK_ASSERT + } + + // put implementation in a separate function to avoid template bloat + create2DBlockAssertsImpl( + op.getBaseWidth(), op.getBaseHeight(), op.getBasePitch(), op.getX(), + op.getElemSizeInBits() / 8, op->getLoc(), rewriter, emitter); +} + namespace { //===----------------------------------------------------------------------===// @@ -638,9 +765,17 @@ struct TritonMatrix2DBlockLoadLowering using ConvertOpToLLVMPattern< TritonGEN::Matrix2DBlockLoadOp>::ConvertOpToLLVMPattern; + explicit TritonMatrix2DBlockLoadLowering( + LLVMTypeConverter &typeConverter, + const mlir::triton::gpu::intel::LibCallEmitter &emitter) + : ConvertOpToLLVMPattern(typeConverter), + emitter(emitter) {} + LogicalResult matchAndRewrite(TritonGEN::Matrix2DBlockLoadOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { + create2DBlockAsserts(op, rewriter, emitter); + if (!isSPVBuiltinAvailable(op)) { // Fallback to GenISA interface. 
rewriter.replaceOp(op, createGenISA2DBlockRead(op, rewriter)); @@ -706,6 +841,9 @@ struct TritonMatrix2DBlockLoadLowering rewriter.replaceOp(op, rewriter.create(loc, resType, dest)); return success(); } + +private: + const mlir::triton::gpu::intel::LibCallEmitter &emitter; }; struct TritonMatrix2DBlockStoreLowering @@ -713,9 +851,17 @@ struct TritonMatrix2DBlockStoreLowering using ConvertOpToLLVMPattern< TritonGEN::Matrix2DBlockStoreOp>::ConvertOpToLLVMPattern; + explicit TritonMatrix2DBlockStoreLowering( + LLVMTypeConverter &typeConverter, + const mlir::triton::gpu::intel::LibCallEmitter &emitter) + : ConvertOpToLLVMPattern(typeConverter), + emitter(emitter) {} + LogicalResult matchAndRewrite(TritonGEN::Matrix2DBlockStoreOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { + create2DBlockAsserts(op, rewriter, emitter); + if (!isSPVBuiltinAvailable(op)) { // Fallback to GenISA interface. rewriter.replaceOp(op, createGenISA2DBlockWrite(op, rewriter)); @@ -780,6 +926,9 @@ struct TritonMatrix2DBlockStoreLowering rewriter.replaceOp(op, call); return success(); } + +protected: + const mlir::triton::gpu::intel::LibCallEmitter &emitter; }; struct TritonMatrix2DBlockPrefetchLowering @@ -787,9 +936,18 @@ struct TritonMatrix2DBlockPrefetchLowering using ConvertOpToLLVMPattern< TritonGEN::Matrix2DBlockPrefetchOp>::ConvertOpToLLVMPattern; + explicit TritonMatrix2DBlockPrefetchLowering( + LLVMTypeConverter &typeConverter, + const mlir::triton::gpu::intel::LibCallEmitter &emitter) + : ConvertOpToLLVMPattern( + typeConverter), + emitter(emitter) {} + LogicalResult matchAndRewrite(TritonGEN::Matrix2DBlockPrefetchOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { + create2DBlockAsserts(op, rewriter, emitter); + if (!isSPVBuiltinAvailable(op)) { // Fallback to GenISA interface. 
rewriter.replaceOp(op, createGenISA2DBlockPrefetch(op, rewriter)); @@ -846,6 +1004,9 @@ struct TritonMatrix2DBlockPrefetchLowering rewriter.replaceOp(op, call); return success(); } + +private: + const mlir::triton::gpu::intel::LibCallEmitter &emitter; }; template loadDialect(); - } - - /// Hook for derived dialect interface to provide conversion patterns - /// and mark dialect legal for the conversion target. - void populateConvertToLLVMConversionPatterns( - ConversionTarget &target, LLVMTypeConverter &typeConverter, - RewritePatternSet &patterns) const final { - populateTritonGENToLLVMConversionPatterns(typeConverter, patterns); - } -}; - -} // namespace - //===----------------------------------------------------------------------===// // Pattern Population and Registration //===----------------------------------------------------------------------===// void mlir::triton::populateTritonGENToLLVMConversionPatterns( - LLVMTypeConverter &converter, RewritePatternSet &patterns) { + LLVMTypeConverter &converter, RewritePatternSet &patterns, + const mlir::triton::gpu::intel::LibCallEmitter &emitter) { patterns - .add(converter); -} + .add(converter, emitter); -void registerConvertTritonTritonGENToLLVMInterface(DialectRegistry ®istry) { - registry.addExtension( - +[](MLIRContext *ctx, TritonGEN::TritonGENDialect *dialect) { - dialect->addInterfaces(); - }); + patterns.add( + converter); } diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp index 788d746f85..3a466f9874 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp @@ -150,143 +150,22 @@ std::string TargetInfo::getMulhiFuncName(Type resultElementTy) const { return funcName; } -Value printfPromoteValue(RewriterBase &rewriter, Value value, bool isSigned) { - auto type = value.getType(); - if (isa(type) && type.getIntOrFloatBitWidth() == 1) { - // FIXME: There 
is some problem when using i1 type now, - // remove this code once IGC fix the problem. - TritonLLVMOpBuilder b(rewriter.getUnknownLoc(), rewriter); - return b.zext(i8_ty, value); - } else if (type.isIntOrIndex() && type.getIntOrFloatBitWidth() < 32) { - TritonLLVMOpBuilder b(rewriter.getUnknownLoc(), rewriter); - if (isSigned) { - return b.sext(i32_ty, value); - } else { - return b.zext(i32_ty, value); - } - } else { - return value; - } -} - -// declare __spirv_ocl_printf(i8*, ...) as external function -static LLVM::LLVMFuncOp getSpirvPrintfDeclaration(RewriterBase &rewriter) { - auto moduleOp = rewriter.getBlock()->getParent()->getParentOfType(); - StringRef funcName("_Z18__spirv_ocl_printf"); - Operation *funcOp = moduleOp.lookupSymbol(funcName); - if (funcOp) - return cast(*funcOp); - - MLIRContext *context = rewriter.getContext(); - auto ptrTy = LLVM::LLVMPointerType::get( - context, TritonGEN::TritonGENMemorySpace::kUniformConstant); - SmallVector argsType{ptrTy}; - auto retType = i32_ty; - auto funcType = - LLVM::LLVMFunctionType::get(retType, argsType, /*isVarArg*/ true); - - ConversionPatternRewriter::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(moduleOp.getBody()); - - auto printFunc = rewriter.create( - UnknownLoc::get(context), funcName, funcType, LLVM::Linkage::External, - /*dsoLocal*/ false, LLVM::CConv::SPIR_FUNC, /*comdat=*/SymbolRefAttr{}); - printFunc->setAttr("nounwind", rewriter.getUnitAttr()); - - return printFunc; -} - void TargetInfo::printf(RewriterBase &rewriter, Value formatStrStart, int /*formatStrByteCount*/, ValueRange args, ArrayRef isSigned) const { - auto *ctx = rewriter.getContext(); - Type ptr = ptr_ty(ctx); - auto moduleOp = rewriter.getBlock()->getParent()->getParentOfType(); - auto funcOp = getSpirvPrintfDeclaration(rewriter); - auto loc = UnknownLoc::get(ctx); - auto b = TritonLLVMOpBuilder(loc, rewriter); - - SmallVector operands; - operands.push_back(formatStrStart); - for (auto [i, arg] : 
llvm::enumerate(args)) { - operands.push_back(printfPromoteValue( - rewriter, arg, isSigned.empty() ? true : isSigned[i])); - } - auto callOp = b.call(funcOp, operands); - callOp.setCConv(triton::gpu::intel::getRequiredCConv(callOp)); + emitter.printf(rewriter, formatStrStart, /*formatStrByteCount*/ 0, args, + isSigned); } void TargetInfo::printf(RewriterBase &rewriter, StringRef msg, ValueRange args, ArrayRef isSigned) const { - assert(!msg.empty() && "printf with empty string not supported"); - llvm::SmallString<64> msgNewline(msg); - msgNewline.push_back('\n'); - msgNewline.push_back('\0'); - Value msgValue = getGlobalStringStart( - rewriter.getUnknownLoc(), rewriter, "printfFormat_", msgNewline, - /*addressSpace=*/TritonGEN::kUniformConstant); - printf(rewriter, msgValue, msgNewline.size_in_bytes(), args, isSigned); -} - -static LLVM::LLVMFuncOp getAssertfailDeclaration(RewriterBase &rewriter) { - auto moduleOp = rewriter.getBlock()->getParent()->getParentOfType(); - StringRef funcName = "__assert_fail"; - Operation *funcOp = moduleOp.lookupSymbol(funcName); - if (funcOp) - return cast(*funcOp); - - // void __assert_fail(const char * assertion, const char * file, unsigned - // int line, const char * function); - auto *ctx = rewriter.getContext(); - SmallVector argsType; - argsType = {ptr_ty(ctx, TritonGEN::TritonGENMemorySpace::kGeneric), - ptr_ty(ctx, TritonGEN::TritonGENMemorySpace::kGeneric), i32_ty, - ptr_ty(ctx, TritonGEN::TritonGENMemorySpace::kGeneric)}; - auto funcType = LLVM::LLVMFunctionType::get(void_ty(ctx), argsType); - - RewriterBase::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(moduleOp.getBody()); - - auto func = rewriter.create(UnknownLoc::get(ctx), funcName, - funcType); - func.setCConv(LLVM::cconv::CConv::SPIR_FUNC); - return func; + emitter.printf(rewriter, msg, args, isSigned); } void TargetInfo::assertFail(RewriterBase &rewriter, Location loc, StringRef message, StringRef file, StringRef func, int line) const { - 
auto b = TritonLLVMOpBuilder(loc, rewriter); - auto funcOp = getAssertfailDeclaration(rewriter); - auto moduleOp = rewriter.getBlock()->getParent()->getParentOfType(); - unsigned addrSpace = TritonGEN::TritonGENMemorySpace::kCrossWorkgroup; - llvm::SmallString<64> messageString(message), fileString(file), - funcString(func); - messageString.push_back('\0'); - fileString.push_back('\0'); - funcString.push_back('\0'); - Value messageStringVal = - getGlobalStringStart(loc, rewriter, "assertMessage_", messageString, - /*addressSpace=*/TritonGEN::kCrossWorkgroup); - Value fileStringVal = - getGlobalStringStart(loc, rewriter, "assertFile_", fileString, - /*addressSpace=*/TritonGEN::kCrossWorkgroup); - Value funcStringVal = - getGlobalStringStart(loc, rewriter, "assertFunc_", funcString, - /*addressSpace=*/TritonGEN::kCrossWorkgroup); - Value lineNumber = b.i32_val(line); - - auto *ctx = rewriter.getContext(); - SmallVector operands; - Value messageStringPtr = b.addrspacecast( - ptr_ty(ctx, TritonGEN::TritonGENMemorySpace::kGeneric), messageStringVal); - Value fileStringPtr = b.addrspacecast( - ptr_ty(ctx, TritonGEN::TritonGENMemorySpace::kGeneric), fileStringVal); - Value funcStringPtr = b.addrspacecast( - ptr_ty(ctx, TritonGEN::TritonGENMemorySpace::kGeneric), funcStringVal); - operands = {messageStringPtr, fileStringPtr, lineNumber, funcStringPtr}; - auto ret = b.call(funcOp, operands); - ret.setCConv(LLVM::cconv::CConv::SPIR_FUNC); + return emitter.assertFail(rewriter, loc, message, file, func, line); } int TargetInfo::getSharedAddressSpace() const { @@ -312,47 +191,7 @@ int TargetInfo::getAddressSpace(Attribute addressSpace) const { Value TargetInfo::getGlobalStringStart(Location loc, RewriterBase &rewriter, StringRef name, StringRef value, unsigned addressSpace) const { - auto b = TritonLLVMOpBuilder(loc, rewriter); - LLVM::GlobalOp global = - getGlobalString(loc, rewriter, name, value, addressSpace); - MLIRContext *ctx = rewriter.getContext(); - Type globalPtrType 
= ptr_ty(ctx, addressSpace); - Value globalPtr = rewriter.create(loc, global); - return b.gep(globalPtrType, i8_ty, globalPtr, LLVM::GEPArg{0}); -} - -LLVM::GlobalOp TargetInfo::getGlobalString(Location loc, RewriterBase &rewriter, - StringRef name, StringRef value, - unsigned addressSpace) const { - StringAttr valueAttr = rewriter.getStringAttr(value); - std::pair cacheKey{addressSpace, valueAttr}; - auto pos = globals.find(cacheKey); - if (pos != globals.end()) - return pos->second; - - ModuleOp moduleOp = rewriter.getInsertionPoint()->getParentOfType(); - - llvm::SmallString<64> contentStr(value); - size_t contentSize = contentStr.size_in_bytes(); - auto globalType = LLVM::LLVMArrayType::get(i8_ty, contentSize); - - auto createGlobal = [&](StringRef name) { - RewriterBase::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(moduleOp.getBody()); - return rewriter.create( - rewriter.getUnknownLoc(), globalType, - /*isConstant=*/true, LLVM::Linkage::Internal, name, valueAttr, - /*alignment=*/0, addressSpace); - }; - - LLVM::GlobalOp global = - moduleOp.lookupSymbol(name) - ? 
createGlobal(Twine{name}.concat(Twine{globals.size()}).str()) - : createGlobal(name); - - globals.try_emplace(cacheKey, global); - - return global; + return emitter.getGlobalStringStart(loc, rewriter, name, value, addressSpace); } std::unique_ptr createTargetInfo(ModuleOp mod) { diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.h b/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.h index f63c55c7f2..414e59ce12 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.h @@ -9,6 +9,7 @@ #ifndef TRITON_CONVERSION_TRITONGPU_TO_LLVM_TARGETINFOINTEL_H #define TRITON_CONVERSION_TRITONGPU_TO_LLVM_TARGETINFOINTEL_H +#include "Utils/LibCallEmitter.h" #include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h" #include @@ -87,8 +88,7 @@ class TargetInfo : public mlir::triton::TargetInfoBase { StringRef name, StringRef value, unsigned addressSpace) const; - mutable llvm::DenseMap, LLVM::GlobalOp> - globals; + const mlir::triton::gpu::intel::LibCallEmitter emitter; }; std::unique_ptr createTargetInfo(ModuleOp mod); diff --git a/third_party/intel/lib/Utils/CMakeLists.txt b/third_party/intel/lib/Utils/CMakeLists.txt index 0731c57759..da651add27 100644 --- a/third_party/intel/lib/Utils/CMakeLists.txt +++ b/third_party/intel/lib/Utils/CMakeLists.txt @@ -1,5 +1,6 @@ add_triton_library(TritonIntelUtils DefUseChain.cpp + LibCallEmitter.cpp LLVMIntr.cpp Mangling.cpp Utility.cpp diff --git a/third_party/intel/lib/Utils/LibCallEmitter.cpp b/third_party/intel/lib/Utils/LibCallEmitter.cpp new file mode 100644 index 0000000000..35297bc26f --- /dev/null +++ b/third_party/intel/lib/Utils/LibCallEmitter.cpp @@ -0,0 +1,206 @@ +#include "LibCallEmitter.h" + +#include "Dialect/TritonIntelGPU/IR/Utils.h" + +#include "triton/Conversion/TritonGPUToLLVM/Utility.h" + +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/IR/BuiltinOps.h" + +using namespace mlir; + +namespace 
mlir::triton::gpu::intel { + +static Value printfPromoteValue(RewriterBase &rewriter, Value value, + bool isSigned) { + auto type = value.getType(); + if (isa(type) && type.getIntOrFloatBitWidth() == 1) { + // FIXME: There is some problem when using i1 type now, + // remove this code once IGC fix the problem. + TritonLLVMOpBuilder b(rewriter.getUnknownLoc(), rewriter); + return b.zext(i8_ty, value); + } else if (type.isIntOrIndex() && type.getIntOrFloatBitWidth() < 32) { + TritonLLVMOpBuilder b(rewriter.getUnknownLoc(), rewriter); + if (isSigned) { + return b.sext(i32_ty, value); + } else { + return b.zext(i32_ty, value); + } + } else { + return value; + } +} + +// declare __spirv_ocl_printf(i8*, ...) as external function +static LLVM::LLVMFuncOp getSpirvPrintfDeclaration(RewriterBase &rewriter) { + auto moduleOp = rewriter.getBlock()->getParent()->getParentOfType(); + StringRef funcName("_Z18__spirv_ocl_printf"); + Operation *funcOp = moduleOp.lookupSymbol(funcName); + if (funcOp) + return cast(*funcOp); + + MLIRContext *context = rewriter.getContext(); + auto ptrTy = LLVM::LLVMPointerType::get( + context, TritonGEN::TritonGENMemorySpace::kUniformConstant); + SmallVector argsType{ptrTy}; + auto retType = i32_ty; + auto funcType = + LLVM::LLVMFunctionType::get(retType, argsType, /*isVarArg*/ true); + + ConversionPatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(moduleOp.getBody()); + + auto printFunc = rewriter.create( + UnknownLoc::get(context), funcName, funcType, LLVM::Linkage::External, + /*dsoLocal*/ false, LLVM::CConv::SPIR_FUNC, /*comdat=*/SymbolRefAttr{}); + printFunc->setAttr("nounwind", rewriter.getUnitAttr()); + + return printFunc; +} + +static LLVM::LLVMFuncOp getAssertfailDeclaration(RewriterBase &rewriter) { + auto moduleOp = rewriter.getBlock()->getParent()->getParentOfType(); + StringRef funcName = "__assert_fail"; + Operation *funcOp = moduleOp.lookupSymbol(funcName); + if (funcOp) + return cast(*funcOp); + + // 
void __assert_fail(const char * assertion, const char * file, unsigned + // int line, const char * function); + auto *ctx = rewriter.getContext(); + SmallVector argsType; + argsType = {ptr_ty(ctx, TritonGEN::TritonGENMemorySpace::kGeneric), + ptr_ty(ctx, TritonGEN::TritonGENMemorySpace::kGeneric), i32_ty, + ptr_ty(ctx, TritonGEN::TritonGENMemorySpace::kGeneric)}; + auto funcType = LLVM::LLVMFunctionType::get(void_ty(ctx), argsType); + + RewriterBase::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(moduleOp.getBody()); + + auto func = rewriter.create(UnknownLoc::get(ctx), funcName, + funcType); + func.setCConv(LLVM::cconv::CConv::SPIR_FUNC); + return func; +} + +Value LibCallEmitter::getGlobalStringStart(Location loc, RewriterBase &rewriter, + StringRef name, StringRef value, + unsigned addressSpace) const { + auto b = TritonLLVMOpBuilder(loc, rewriter); + LLVM::GlobalOp global = + getGlobalString(loc, rewriter, name, value, addressSpace); + MLIRContext *ctx = rewriter.getContext(); + Type globalPtrType = ptr_ty(ctx, addressSpace); + Value globalPtr = rewriter.create(loc, global); + return b.gep(globalPtrType, i8_ty, globalPtr, LLVM::GEPArg{0}); +} + +LLVM::GlobalOp LibCallEmitter::getGlobalString(Location loc, + RewriterBase &rewriter, + StringRef name, StringRef value, + unsigned addressSpace) const { + StringAttr valueAttr = rewriter.getStringAttr(value); + std::pair cacheKey{addressSpace, valueAttr}; + auto pos = globals.find(cacheKey); + if (pos != globals.end()) + return pos->second; + + ModuleOp moduleOp = + rewriter.getBlock()->getParent()->getParentOfType(); + + llvm::SmallString<64> contentStr(value); + size_t contentSize = contentStr.size_in_bytes(); + auto globalType = LLVM::LLVMArrayType::get(i8_ty, contentSize); + + auto createGlobal = [&](StringRef name) { + RewriterBase::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(moduleOp.getBody()); + return rewriter.create( + rewriter.getUnknownLoc(), globalType, + 
/*isConstant=*/true, LLVM::Linkage::Internal, name, valueAttr, + /*alignment=*/0, addressSpace); + }; + + LLVM::GlobalOp global = + moduleOp.lookupSymbol(name) + ? createGlobal(Twine{name}.concat(Twine{globals.size()}).str()) + : createGlobal(name); + + globals.try_emplace(cacheKey, global); + + return global; +} + +//===----------------------------------------------------------------------===// +// Public API +//===----------------------------------------------------------------------===// + +void LibCallEmitter::printf(RewriterBase &rewriter, Value formatStrStart, + int /*formatStrByteCount*/, ValueRange args, + ArrayRef isSigned) const { + auto *ctx = rewriter.getContext(); + Type ptr = ptr_ty(ctx); + auto moduleOp = rewriter.getBlock()->getParent()->getParentOfType(); + auto funcOp = getSpirvPrintfDeclaration(rewriter); + auto loc = UnknownLoc::get(ctx); + auto b = TritonLLVMOpBuilder(loc, rewriter); + + SmallVector operands; + operands.push_back(formatStrStart); + for (auto [i, arg] : llvm::enumerate(args)) { + operands.push_back(printfPromoteValue( + rewriter, arg, isSigned.empty() ? 
true : isSigned[i])); + } + auto callOp = b.call(funcOp, operands); + callOp.setCConv(triton::gpu::intel::getRequiredCConv(callOp)); +} + +void LibCallEmitter::printf(RewriterBase &rewriter, StringRef msg, + ValueRange args, ArrayRef isSigned) const { + assert(!msg.empty() && "printf with empty string not supported"); + llvm::SmallString<64> msgNewline(msg); + msgNewline.push_back('\n'); + msgNewline.push_back('\0'); + Value msgValue = getGlobalStringStart( + rewriter.getUnknownLoc(), rewriter, "printfFormat_", msgNewline, + /*addressSpace=*/TritonGEN::kUniformConstant); + printf(rewriter, msgValue, msgNewline.size_in_bytes(), args, isSigned); +} + +void LibCallEmitter::assertFail(RewriterBase &rewriter, Location loc, + StringRef message, StringRef file, + StringRef func, int line) const { + auto b = TritonLLVMOpBuilder(loc, rewriter); + auto funcOp = getAssertfailDeclaration(rewriter); + auto moduleOp = rewriter.getBlock()->getParent()->getParentOfType(); + unsigned addrSpace = TritonGEN::TritonGENMemorySpace::kCrossWorkgroup; + llvm::SmallString<64> messageString(message), fileString(file), + funcString(func); + messageString.push_back('\0'); + fileString.push_back('\0'); + funcString.push_back('\0'); + Value messageStringVal = + getGlobalStringStart(loc, rewriter, "assertMessage_", messageString, + /*addressSpace=*/TritonGEN::kCrossWorkgroup); + Value fileStringVal = + getGlobalStringStart(loc, rewriter, "assertFile_", fileString, + /*addressSpace=*/TritonGEN::kCrossWorkgroup); + Value funcStringVal = + getGlobalStringStart(loc, rewriter, "assertFunc_", funcString, + /*addressSpace=*/TritonGEN::kCrossWorkgroup); + Value lineNumber = b.i32_val(line); + + auto *ctx = rewriter.getContext(); + SmallVector operands; + Value messageStringPtr = b.addrspacecast( + ptr_ty(ctx, TritonGEN::TritonGENMemorySpace::kGeneric), messageStringVal); + Value fileStringPtr = b.addrspacecast( + ptr_ty(ctx, TritonGEN::TritonGENMemorySpace::kGeneric), fileStringVal); + Value 
funcStringPtr = b.addrspacecast( + ptr_ty(ctx, TritonGEN::TritonGENMemorySpace::kGeneric), funcStringVal); + operands = {messageStringPtr, fileStringPtr, lineNumber, funcStringPtr}; + auto ret = b.call(funcOp, operands); + ret.setCConv(LLVM::cconv::CConv::SPIR_FUNC); +} + +} // namespace mlir::triton::gpu::intel diff --git a/third_party/intel/lib/Utils/LibCallEmitter.h b/third_party/intel/lib/Utils/LibCallEmitter.h new file mode 100644 index 0000000000..8ee732916f --- /dev/null +++ b/third_party/intel/lib/Utils/LibCallEmitter.h @@ -0,0 +1,45 @@ +//===- LibCallEmitter.h - Emit library calls for Intel backend --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef TRITON_INTEL_UTILS_LIBCALLEMITTER_H +#define TRITON_INTEL_UTILS_LIBCALLEMITTER_H + +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" + +namespace mlir::triton::gpu::intel { + +class LibCallEmitter { +public: + LibCallEmitter() = default; + + void printf(RewriterBase &rewriter, Value formatStrStart, + int formatStrByteCount, ValueRange args, + ArrayRef isSigned = {}) const; + + void printf(RewriterBase &rewriter, StringRef msg, ValueRange args, + ArrayRef isSigned = {}) const; + + void assertFail(RewriterBase &rewriter, Location loc, StringRef message, + StringRef file, StringRef func, int line) const; + + Value getGlobalStringStart(Location loc, RewriterBase &rewriter, + StringRef name, StringRef value, + unsigned addressSpace) const; + +private: + LLVM::GlobalOp getGlobalString(Location loc, RewriterBase &rewriter, + StringRef name, StringRef value, + unsigned addressSpace) const; + + mutable llvm::DenseMap, LLVM::GlobalOp> + globals; +}; + +} // namespace mlir::triton::gpu::intel + +#endif // TRITON_INTEL_UTILS_LIBCALLEMITTER_H