diff --git a/test/Analysis/intel/test-axis-info.mlir b/test/Analysis/intel/test-axis-info.mlir index 1a3805f018..39dcd0bd3e 100644 --- a/test/Analysis/intel/test-axis-info.mlir +++ b/test/Analysis/intel/test-axis-info.mlir @@ -885,9 +885,11 @@ tt.func public @make_tensor_ptr(%arg0: !tt.ptr, %arg1: !tt.ptr {tt. %c1_i64 = arith.constant 1 : i64 %c32_i64 = arith.constant 32 : i64 %c128_i64 = arith.constant 128 : i64 - // CHECK: %0 = tt.make_tensor_ptr %arg0, {{.*}} => contiguity = [128, 32], divisibility = [1, 1], constancy = [1, 1], constant_value = + // CHECK: tt.make_tensor_ptr %arg0, {{.*}} => contiguity = [128, 32], divisibility = [1, 1], constancy = [1, 1], constant_value = %0 = tt.make_tensor_ptr %arg0, [%c128_i64, %c32_i64], [%c1_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array} : !tt.ptr> - // CHECK: %1 = tt.make_tensor_ptr %arg1, {{.*}} => contiguity = [32, 1], divisibility = [16, 1], constancy = [1, 1], constant_value = + // CHECK: tt.make_tensor_ptr %arg1, {{.*}} => contiguity = [64, 1], divisibility = [16, 1], constancy = [1, 1], constant_value = %1 = tt.make_tensor_ptr %arg1, [%c32_i64, %c32_i64], [%c1_i64, %arg2], [%c0_i32, %c0_i32] {order = array} : > + // CHECK: tt.make_tensor_ptr %arg1, {{.*}} => contiguity = [32, 64], divisibility = [1, 1], constancy = [1, 1], constant_value = + %2 = tt.make_tensor_ptr %arg1, [%arg2, %c128_i64], [%c1_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > tt.return } diff --git a/test/TritonIntelGPU/coalesce.mlir b/test/TritonIntelGPU/coalesce.mlir new file mode 100644 index 0000000000..d9b2de454c --- /dev/null +++ b/test/TritonIntelGPU/coalesce.mlir @@ -0,0 +1,338 @@ +// RUN: triton-opt %s -split-input-file -tritonintelgpu-coalesce | FileCheck %s + +#blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1]}> +#slice1dim1 = #triton_gpu.slice<{dim = 1, parent = #blocked1}> +#slice2dim0 = #triton_gpu.slice<{dim = 0, parent = #blocked2}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + +// CHECK: [[row_layout:#.*]] = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}> +// CHECK: [[col_layout:#.*]] = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1]}> +// CHECK: [[load_ptr:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64x!tt.ptr, [[row_layout]]> +// CHECK: [[load_mask:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xi1, [[row_layout]]> +// CHECK: [[load_other:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xf32, [[row_layout]]> +// CHECK: [[load_val:%.*]] = tt.load [[load_ptr]], [[load_mask]], [[load_other]] : tensor<64x64x!tt.ptr, [[row_layout]]> +// CHECK: [[store_ptr:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64x!tt.ptr, [[col_layout]]> +// CHECK: [[store_val:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xf32, [[col_layout]]> +// CHECK: [[store_mask:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xi1, [[col_layout]]> +// CHECK: tt.store [[store_ptr]], [[store_val]], [[store_mask]] +tt.func @transpose(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, + %arg1: i32 {tt.divisibility = 16 : i32}, + %arg2: !tt.ptr {tt.divisibility = 16 : i32}, + %arg3: 
i32 {tt.divisibility = 16 : i32}) { + %cst = arith.constant dense : tensor<64x64xi1, #blocked1> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked1> + %00 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #slice1dim1> + %01 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #slice2dim0> + %1 = tt.expand_dims %00 {axis = 1 : i32} : tensor<64xi32, #slice1dim1> -> tensor<64x1xi32, #blocked1> + %2 = tt.splat %arg1 : i32 -> tensor<64x1xi32, #blocked1> + %3 = arith.muli %1, %2 : tensor<64x1xi32, #blocked1> + %4 = tt.splat %arg0 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> + %5 = tt.addptr %4, %3 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> + %6 = tt.expand_dims %01 {axis = 0 : i32} : tensor<64xi32, #slice2dim0> -> tensor<1x64xi32, #blocked2> + %7 = tt.broadcast %5 : tensor<64x1x!tt.ptr, #blocked1> -> tensor<64x64x!tt.ptr, #blocked1> + %8 = tt.broadcast %6 : tensor<1x64xi32, #blocked2> -> tensor<64x64xi32, #blocked2> + %9 = triton_gpu.convert_layout %8 : tensor<64x64xi32, #blocked2> -> tensor<64x64xi32, #blocked1> + %10 = tt.addptr %7, %9 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> + %11 = tt.splat %arg2 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> + %12 = tt.addptr %11, %1 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> + %13 = tt.splat %arg3 : i32 -> tensor<1x64xi32, #blocked2> + %14 = arith.muli %6, %13 : tensor<1x64xi32, #blocked2> + %15 = tt.broadcast %12 : tensor<64x1x!tt.ptr, #blocked1> -> tensor<64x64x!tt.ptr, #blocked1> + %16 = tt.broadcast %14 : tensor<1x64xi32, #blocked2> -> tensor<64x64xi32, #blocked2> + %17 = triton_gpu.convert_layout %16 : tensor<64x64xi32, #blocked2> -> tensor<64x64xi32, #blocked1> + %18 = tt.addptr %15, %17 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> + %19 = tt.load %10, %cst, %cst_0 : tensor<64x64x!tt.ptr, #blocked1> + tt.store %18, %19, %cst : tensor<64x64x!tt.ptr, #blocked1> + tt.return +} + +} + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + + +// CHECK: [[NARROW_LAYOUT:#.*]] = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +// CHECK: [[WIDE_LAYOUT:#.*]] = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +tt.func public @load_tensors_two_types(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %1 : i32 -> tensor<1024xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked> + %5 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> + %6 = arith.cmpi "slt", %4, %5 : tensor<1024xi32, #blocked> + %7 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %8 = tt.addptr %7, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %9 = tt.load %8, %6 : tensor<1024x!tt.ptr, #blocked> + %10 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %12 = tt.load %11, %6 : 
tensor<1024x!tt.ptr, #blocked> + %13 = arith.extf %12 : tensor<1024xf16, #blocked> to tensor<1024xf32, #blocked> + %14 = arith.addf %9, %13 : tensor<1024xf32, #blocked> + %15 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %16 = tt.addptr %15, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + // CHECK: tt.store {{.*}} : tensor<1024x!tt.ptr, [[WIDE_LAYOUT]]> + tt.store %16, %14, %6 : tensor<1024x!tt.ptr, #blocked> + tt.return +} + +} + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-NOT: sizePerThread = [4] +// CHECK: #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +// CHECK-NOT: sizePerThread = [4] +tt.func public @load_tensors_two_types(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %1 : i32 -> tensor<1024xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked> + %5 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> + %6 = arith.cmpi "slt", %4, %5 : tensor<1024xi32, #blocked> + %7 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %8 = tt.addptr %7, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %9 = tt.load %8, %6 : tensor<1024x!tt.ptr, #blocked> + %10 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %12 = tt.load %11, %6 : tensor<1024x!tt.ptr, #blocked> + %13 = arith.extf %12 : tensor<1024xf16, #blocked> to tensor<1024xf32, #blocked> + %14 = arith.addf %9, %13 : tensor<1024xf32, #blocked> + %15 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %16 = tt.addptr %15, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %17 = arith.truncf %14 : tensor<1024xf32, #blocked> to tensor<1024xf16, #blocked> + tt.store %16, %17, %6 : tensor<1024x!tt.ptr, #blocked> + tt.return +} + +} + +// ----- + +// COM: Reproducer for issue #3866 +// CHECK-LABEL: @test_3866 +// CHECK: tt.load {{.*}} : !tt.ptr +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { + tt.func public @test_3866(%arg0: !tt.ptr, %arg1: i32, %arg2: i64) { + %0 = tt.make_tensor_ptr %arg0, [%arg2, %arg2], [%arg2, %arg2], [%arg1, %arg1] {order = array} : > + %1 = tt.load %0 : !tt.ptr> + tt.return + } +} + +// ----- + +// COM: Test coalescing on blocked pointers: coalescable load using block pointer in a SCF for loop. 
+ +#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [4], order = [0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 1], order = [1, 0]}> +#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> +#dot1 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}> +#dot2 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { + // CHECK: [[BLOCKED_LAYOUT1:#.*]] = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 8], warpsPerCTA = [4, 1], order = [1, 0]}> + // CHECK: [[BLOCKED_LAYOUT2:#.*]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [4, 4], warpsPerCTA = [1, 4], order = [0, 1]}> + // CHECK: @test_block_ptrs + tt.func public @test_block_ptrs(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: f32, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}, %arg9: i32, %arg10: i32, %arg11: i32 {tt.divisibility = 16 : i32}, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32, %arg16: i32, %arg17: i32 {tt.divisibility = 16 : i32}, %arg18: i32, %arg19: i32, %arg20: i32) { + %cst = arith.constant dense<0.000000e+00> : tensor<8x16xf32, #dpas> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<8xf32, #blocked> + %cst_1 = arith.constant dense<0xFF800000> : tensor<8xf32, #blocked> + %c1_i32 = arith.constant 1 : i32 + %c16_i32 = arith.constant 16 : i32 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<8x64xf32, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c1_i64 = arith.constant 1 : i64 + %c64_i64 = arith.constant 64 : i64 + %c8_i32 = arith.constant 8 : i32 + %0 = tt.get_program_id x : i32 + %1 = tt.get_program_id y : i32 + %2 = arith.divsi %1, %arg19 : i32 + %3 = arith.remsi %1, %arg19 : i32 + %4 = arith.extsi %2 : i32 to i64 + %5 = arith.extsi %arg6 : i32 to i64 + %6 = arith.muli %4, %5 : i64 + %7 = arith.extsi %3 : i32 to i64 + %8 = arith.extsi %arg7 : i32 to i64 + %9 = arith.muli %7, %8 : i64 + %10 = arith.addi %6, %9 : i64 + %11 = tt.addptr %arg0, %10 : !tt.ptr, i64 + %12 = arith.muli %0, %c8_i32 : i32 + %13 = arith.extsi %arg20 : i32 to i64 + %14 = arith.extsi %arg8 : i32 to i64 + // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}} : + %15 = tt.make_tensor_ptr %11, [%13, %c64_i64], [%14, %c1_i64], [%12, %c0_i32] {order = array} : > + %16 = tt.addptr %arg1, %10 : !tt.ptr, i64 + %17 = arith.extsi %arg11 : i32 to i64 + // CHECK: [[PTR2:%.*]] = tt.make_tensor_ptr {{.*}} : + %18 = tt.make_tensor_ptr %16, [%c64_i64, %13], [%c1_i64, %17], [%c0_i32, %c0_i32] {order = array} : > + %19 = tt.addptr %arg5, %10 : !tt.ptr, i64 + %20 = arith.extsi %arg17 : i32 to i64 + // CHECK: [[PTR3:%.*]] = tt.make_tensor_ptr {{.*}} : + %21 = tt.make_tensor_ptr %19, [%13, %c64_i64], [%20, %c1_i64], [%12, %c0_i32] {order = array} : > + %22 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #blocked> + %23 = tt.splat %12 : 
i32 -> tensor<8xi32, #blocked> + %24 = arith.addi %23, %22 : tensor<8xi32, #blocked> + // CHECK: [[LOAD1:%.*]] = tt.load [[PTR1]] : !tt.ptr + // CHECK-NEXT: triton_gpu.convert_layout [[LOAD1]] : tensor<8x64xf8E5M2, [[BLOCKED_LAYOUT1]]> -> tensor<8x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %25 = tt.load %15 : !tt.ptr> + %26 = arith.addi %0, %c1_i32 : i32 + %27 = arith.muli %26, %c8_i32 : i32 + // CHECK: [[ADVANCE1:%.*]] = tt.advance [[PTR2]], {{.*}} : > + %28 = tt.advance %18, [%c0_i32, %12] : > + // CHECK: [[RES:%.*:2]] = scf.for {{.*}} iter_args(%arg22 = %cst_1, %arg23 = [[ADVANCE1]]) -> (tensor<8xf32, #blocked>, !tt.ptr>) + %29:2 = scf.for %arg21 = %12 to %27 step %c16_i32 iter_args(%arg22 = %cst_1, %arg23 = %28) -> (tensor<8xf32, #blocked>, !tt.ptr>) : i32 { + // CHECK: [[LOAD2:%.*]] = tt.load %arg23 : !tt.ptr> + // CHECK-NEXT: triton_gpu.convert_layout [[LOAD2]] : tensor<64x16xf8E5M2, [[BLOCKED_LAYOUT2]]> -> tensor<64x16xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %36 = tt.load %arg23 : !tt.ptr> + %37 = tt.fp_to_fp %25 : tensor<8x64xf8E5M2, #dot1> -> tensor<8x64xf16, #dot1> + %38 = tt.fp_to_fp %36 : tensor<64x16xf8E5M2, #dot2> -> tensor<64x16xf16, #dot2> + %39 = tt.dot %37, %38, %cst, inputPrecision = tf32 : tensor<8x64xf16, #dot1> * tensor<64x16xf16, #dot2> -> tensor<8x16xf32, #dpas> + %40 = triton_gpu.convert_layout %39 : tensor<8x16xf32, #dpas> -> tensor<8x16xf32, #blocked2> + %41 = "tt.reduce"(%40) <{axis = 1 : i32}> ({ + ^bb0(%arg24: f32, %arg25: f32): + %44 = arith.maxnumf %arg24, %arg25 : f32 + tt.reduce.return %44 : f32 + }) : (tensor<8x16xf32, #blocked2>) -> tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> + %42 = triton_gpu.convert_layout %41 : tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<8xf32, #blocked> + // CHECK: [[ADVANCE2:%.*]] = tt.advance %arg23, {{.*}} : > + // CHECK-NEXT: scf.yield {{.*}}, [[ADVANCE2]] : tensor<8xf32, #blocked>, !tt.ptr> + %43 = tt.advance %arg23, [%c0_i32, %c16_i32] : > + scf.yield %42, %43 : tensor<8xf32, #blocked>, !tt.ptr> + } {tt.divisibility_arg1 = dense<16> : tensor<1xi32>} + %30 = arith.addf %29#0, %cst_0 : tensor<8xf32, #blocked> + %31 = arith.muli %1, %arg20 : i32 + %32 = tt.addptr %arg4, %31 : !tt.ptr, i32 + %33 = tt.splat %32 : !tt.ptr -> tensor<8x!tt.ptr, #blocked> + %34 = tt.addptr %33, %24 : tensor<8x!tt.ptr, #blocked>, tensor<8xi32, #blocked> + tt.store %34, %30 : tensor<8x!tt.ptr, #blocked> + %35 = tt.fp_to_fp %cst_2, rounding = rtne : tensor<8x64xf32, #blocked1> -> tensor<8x64xf8E5M2, #blocked1> + tt.store %21, %35 : !tt.ptr> + tt.return + } +} + +// ----- + +// COM: Test coalescing on blocked pointers: loop results used by another loop. 
+ +#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [4], order = [0]}> +#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}> +#dot2 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK: [[BLOCKED_LAYOUT:#.*]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}> + // CHECK: @test_block_ptrs + tt.func public @test_block_ptrs(%arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32}, %arg19: i32) { + %cst_1 = arith.constant dense<1.000000e+00> : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>> + %c32_i32 = arith.constant 32 : i32 + %c64_i32 = arith.constant 64 : i32 + %c64_i64 = arith.constant 64 : i64 + %c1_i64 = arith.constant 1 : i64 + %c0_i32 = arith.constant 0 : i32 + %0 = tt.get_program_id x : i32 + %1 = tt.get_program_id y : i32 + %2 = arith.divsi %1, %arg19 : i32 + %3 = arith.remsi %1, %arg19 : i32 + %4 = arith.extsi %2 : i32 to i64 + %5 = arith.extsi %arg6 : i32 to i64 + %6 = arith.muli %4, %5 : i64 + %7 = arith.extsi %3 : i32 to i64 + %8 = arith.extsi %arg7 : i32 to i64 + %9 = arith.muli %7, %8 : i64 + %10 = arith.addi %6, %9 : i64 + %12 = arith.muli %0, %c64_i32 : i32 + %13 = arith.extsi %arg19 : i32 to i64 + %19 = tt.addptr %arg1, %10 : !tt.ptr, i64 + %20 = arith.extsi %arg11 : i32 to i64 + // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}} : + %21 = tt.make_tensor_ptr %19, [%c64_i64, %13], [%c1_i64, %20], [%c0_i32, %c0_i32] {order = array} : > + // CHECK: [[RES:%.*]]:2 = scf.for {{.*}} iter_args([[ARG1:%.*]] = %cst, [[ARG2:%.*]] = [[PTR1]]) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr>) + %33:2 = scf.for %arg21 = %c0_i32 to %12 step %c32_i32 iter_args(%arg22 = %cst_1, %arg23 = %21) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>>, !tt.ptr>) : i32 { + // CHECK: [[LOAD:%.*]] = tt.load [[ARG2]] : !tt.ptr> + // CHECK-NEXT: triton_gpu.convert_layout [[LOAD]] : tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]> -> tensor<64x32xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + // CHECK-NEXT: scf.yield [[ARG1]], [[ARG2]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr> + %load = tt.load %arg23 : !tt.ptr> + scf.yield %arg22, %arg23 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>>, !tt.ptr> + } + // CHECK: scf.for {{.*}} iter_args([[ARG1:%.*]] = [[RES]]#0, [[ARG2:%.*]] = [[RES]]#1) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr>) + %34:2 = scf.for %arg21 = %c0_i32 to %12 step %c32_i32 iter_args(%arg22 = %33#0, %arg23 = %33#1) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>>, !tt.ptr>) : i32 { + // CHECK: scf.yield [[ARG1]], [[ARG2]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr> + scf.yield %arg22, %arg23 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>>, !tt.ptr> + } + tt.return + } +} + +// ----- + +// COM: Test coalescing on blocked pointers: loop with 2 output blocked pointers. 
+ +#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [4], order = [0]}> +#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { + // CHECK: [[BLOCKED_LAYOUT:#.*]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [4, 4], warpsPerCTA = [1, 4], order = [0, 1]}> + // CHECK: @test_block_ptrs + tt.func public @test_block_ptrs(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32, %arg11: i32 {tt.divisibility = 16 : i32}, %arg14: i32, %arg19: i32, %arg20: i32) { + %c32_i32 = arith.constant 32 : i32 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma> + %c64_i32 = arith.constant 64 : i32 + %c64_i64 = arith.constant 64 : i64 + %c1_i64 = arith.constant 1 : i64 + %c0_i32 = arith.constant 0 : i32 + %0 = tt.get_program_id x : i32 + %1 = tt.get_program_id y : i32 + %2 = arith.divsi %1, %arg19 : i32 + %3 = arith.remsi %1, %arg19 : i32 + %4 = arith.extsi %2 : i32 to i64 + %5 = arith.extsi %arg6 : i32 to i64 + %6 = arith.muli %4, %5 : i64 + %7 = arith.extsi %3 : i32 to i64 + %8 = arith.extsi %arg7 : i32 to i64 + %9 = arith.muli %7, %8 : i64 + %10 = arith.addi %6, %9 : i64 + %11 = tt.addptr %arg0, %10 : !tt.ptr, i64 + %12 = arith.muli %0, %c64_i32 : i32 + %13 = arith.extsi %arg20 : i32 to i64 + %14 = arith.extsi %arg8 : i32 to i64 + %15 = tt.make_tensor_ptr %11, [%13, %c64_i64], [%14, %c1_i64], [%12, %c0_i32] {order = array} : >> + %16 = tt.addptr %arg2, %10 : !tt.ptr, i64 + %17 = arith.extsi %arg14 : i32 to i64 + // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}} : >> + %18 = tt.make_tensor_ptr %16, [%13, %c64_i64], [%c1_i64, %17], [%c0_i32, %c0_i32] {order = array} : >> + %19 = tt.addptr %arg1, %10 : !tt.ptr, i64 + %20 = arith.extsi %arg11 : i32 to i64 + // CHECK: [[PTR2:%.*]] = tt.make_tensor_ptr {{.*}} : + %21 = tt.make_tensor_ptr %19, [%c64_i64, %13], [%c1_i64, %20], [%c0_i32, %c0_i32] {order = array} : >> + %32 = tt.load %15 : !tt.ptr>> + // CHECK: scf.for {{.*}} iter_args([[ARG1:%.*]] = [[PTR2]], [[ARG2:%.*]] = [[PTR1]]) -> (!tt.ptr>, !tt.ptr>>) + %35:2 = scf.for %arg21 = %c0_i32 to %12 step %c32_i32 iter_args(%arg25 = %21, %arg26 = %18) -> (!tt.ptr>>, !tt.ptr>>) : i32 { + // CHECK: [[LOAD:%.*]] = tt.load [[ARG1]] : !tt.ptr> + // CHECK-NEXT: triton_gpu.convert_layout [[LOAD]] : tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]> -> tensor<64x32xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %58 = tt.load %arg25 : !tt.ptr>> + %59 = tt.fp_to_fp %32 : tensor<64x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> -> tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %60 = tt.fp_to_fp %58 : tensor<64x32xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x32xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %61 = tt.dot %59, %60, %cst_2, inputPrecision = tf32 : tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x32xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x32xf32, #mma> + // CHECK-DAG: [[ADVANCE1:%.*]] = 
tt.advance [[ARG1]], {{.*}} : > + // CHECK-DAG: [[ADVANCE2:%.*]] = tt.advance [[ARG2]], {{.*}} : >> + // CHECK-NEXT: scf.yield [[ADVANCE1]], [[ADVANCE2]] : !tt.ptr>, !tt.ptr>> + %84 = tt.advance %arg26, [%c32_i32, %c0_i32] : >> + %85 = tt.advance %arg25, [%c0_i32, %c32_i32] : >> + scf.yield %85, %84 : !tt.ptr>>, !tt.ptr>> + } + tt.return + } +} diff --git a/third_party/intel/include/Analysis/AxisInfo.h b/third_party/intel/include/Analysis/AxisInfo.h index 3016e02cad..1fbaba2e0c 100644 --- a/third_party/intel/include/Analysis/AxisInfo.h +++ b/third_party/intel/include/Analysis/AxisInfo.h @@ -11,7 +11,6 @@ #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include -#include namespace mlir::triton::intel { diff --git a/third_party/intel/lib/Analysis/AxisInfo.cpp b/third_party/intel/lib/Analysis/AxisInfo.cpp index 879b53e49f..378ba01442 100644 --- a/third_party/intel/lib/Analysis/AxisInfo.cpp +++ b/third_party/intel/lib/Analysis/AxisInfo.cpp @@ -1008,49 +1008,49 @@ class MakeTensorPtrOpAxisInfoVisitor final ArrayRef *> operands) override { LDBG("MakeTensorPtrOpAxisInfoVisitor: " << *op); - // TODO: Extend to higher dimension tensor pointers. - if (op.getShape().size() != 2) + auto ptrTy = cast(op.getResult().getType()); + auto tensorType = cast(ptrTy.getPointeeType()); + ArrayRef blkShape = tensorType.getShape(); + unsigned rank = op.getShape().size(); + + // TODO: Support higher rank tensors. + if (rank > 2) return AxisInfo(); - assert(operands.size() == 7 && "MakeTensorPtrOp should have 2D shape"); + SmallVector strideInfo; + for (int i = rank + 1; i <= rank * 2; ++i) + strideInfo.emplace_back(operands[i]->getValue()); AxisInfo ptrInfo = operands[0]->getValue(); - AxisInfo shapeInfo0 = operands[1]->getValue(); - AxisInfo shapeInfo1 = operands[2]->getValue(); - AxisInfo strideInfo0 = operands[3]->getValue(); - AxisInfo strideInfo1 = operands[4]->getValue(); - - std::optional shape0 = shapeInfo0.getConstantValue(); - std::optional shape1 = shapeInfo1.getConstantValue(); - std::optional stride0 = strideInfo0.getConstantValue(); - std::optional stride1 = strideInfo1.getConstantValue(); - - AxisInfo::DimVectorT contiguity{ - shape0.has_value() && (stride0 == 1) ? shape0.value() : 1, - shape1.has_value() && (stride1 == 1) ? shape1.value() : 1}; - - int64_t ptrDivisibility = ptrInfo.getDivisibility()[0]; - int64_t strideDivisibility0 = strideInfo0.getDivisibility()[0]; - int64_t strideDivisibility1 = strideInfo1.getDivisibility()[0]; - - LDBG("ptrDivisibility: " << ptrDivisibility); - LDBG("strideDivisibility0: " << strideDivisibility0); - LDBG("strideDivisibility1: " << strideDivisibility1); - - AxisInfo::DimVectorT divisibility{1, 1}; - if (ptrDivisibility > 1) { - if (contiguity[0] > 1) - divisibility[0] = std::min(ptrDivisibility, strideDivisibility1); - if (contiguity[1] > 1) - divisibility[1] = std::min(ptrDivisibility, strideDivisibility0); + int64_t ptrDivisibility = ptrInfo.getDivisibility(0); + + AxisInfo::DimVectorT contiguity, constancy, divisibility; + for (int dim = 0; dim < rank; ++dim) { + contiguity.push_back( + strideInfo[dim].getConstantValue() == 1 ? blkShape[dim] : 1); + divisibility.push_back( + contiguity[dim] > 1 + ? std::min(ptrDivisibility, + strideInfo[dim == 0 ? 
1 : 0].getDivisibility()[0]) + : 1); + constancy.push_back(1); } - AxisInfo::DimVectorT constancy{1, 1}; - return AxisInfo(contiguity, divisibility, constancy); } }; +class AdvanceOpAxisInfoVisitor final + : public AxisInfoVisitorImpl { +public: + using AxisInfoVisitorImpl::AxisInfoVisitorImpl; + AxisInfo + getAxisInfo(triton::AdvanceOp op, + ArrayRef *> operands) override { + return operands[0]->getValue(); + } +}; + //===----------------------------------------------------------------------===// // AxisInfoAnalysis //===----------------------------------------------------------------------===// @@ -1099,6 +1099,7 @@ AxisInfoAnalysis::AxisInfoAnalysis(DataFlowSolver &solver) MaxMinOpAxisInfoVisitor>(); visitors.append(); visitors.append(); + visitors.append(); } LogicalResult AxisInfoAnalysis::visitOperation( diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp index b213bc4438..7f52090f4e 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp @@ -1,13 +1,16 @@ #include "intel/include/Analysis/AxisInfo.h" #include "intel/include/Dialect/TritonIntelGPU/IR/Utils.h" #include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h" -#include "mlir/Analysis/SliceAnalysis.h" +#include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "mlir/IR/Verifier.h" #include "mlir/Support/LLVM.h" #include "triton/Dialect/Triton/IR/Utility.h" -#include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" #include "triton/Tools/StrUtil.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "tritonintelgpu-coalesce" #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") @@ -20,18 +23,20 @@ namespace mlir::triton::gpu::intel { using namespace mlir; namespace tt = mlir::triton; +namespace ttg = mlir::triton::gpu; namespace ttgi = mlir::triton::gpu::intel; namespace { struct CoalescePass : public ttgi::impl::TritonIntelGPUCoalesceBase { +private: void setCoalescedEncoding(tt::intel::ModuleAxisInfoAnalysis &axisInfoAnalysis, Operation *op, int numWarps, int threadsPerWarp, llvm::MapVector &layoutMap) { Value ptr = getMemAccessPtr(op); - auto refTensorType = cast(ptr.getType()); + LDBG("ptr: " << ptr); LDBG("Considering op: " << *op); LLVM_DEBUG({ @@ -44,6 +49,7 @@ struct CoalescePass SmallVector order = argSort(contiguity); LDBG("order=[" << triton::join(order, ", ") << "]"); + RankedTensorType refTensorType = ttgi::getRankedTensorType(ptr.getType()); auto matchesShape = [&refTensorType](const Value &val) { auto rttType = dyn_cast(val.getType()); return rttType && rttType.getShape() == refTensorType.getShape(); @@ -67,12 +73,11 @@ struct CoalescePass } } - auto shapePerCTA = triton::gpu::getShapePerCTA(refTensorType); + auto shapePerCTA = ttg::getShapePerCTA(refTensorType); LDBG("shapePerCTA=[" << triton::join(shapePerCTA, ", ") << "]"); int numElems = product(shapePerCTA); int numThreads = numWarps * threadsPerWarp; - unsigned perThread = ttgi::getNumElementsPerThread(op, order, axisInfoAnalysis); LDBG("perThread for op: " << perThread); @@ -102,33 +107,207 @@ struct CoalescePass SmallVector sizePerThread(refTensorType.getRank(), 1); sizePerThread[order[0]] = perThread; - auto CTALayout = triton::gpu::getCTALayout(refTensorType.getEncoding()); - layoutMap[op] = 
triton::gpu::BlockedEncodingAttr::get( + auto CTALayout = ttg::getCTALayout(refTensorType.getEncoding()); + layoutMap[op] = ttg::BlockedEncodingAttr::get( &getContext(), refTensorType.getShape(), sizePerThread, order, numWarps, threadsPerWarp, CTALayout); } - static Type getNewType(Type type, Attribute encoding) { - RankedTensorType tensorType = cast(type); + static RankedTensorType getNewType(RankedTensorType tensorType, + Attribute encoding) { return RankedTensorType::get(tensorType.getShape(), tensorType.getElementType(), encoding); } + // Find the defining makeTensorPtrOp operation of the given value. + static std::optional + findDefiningMakeTensorPtrOp(Value val) { + if (auto arg = dyn_cast(val)) { + Operation *parentOp = val.getParentBlock()->getParentOp(); + assert(isa(parentOp) && "Expected a scf::ForOp"); + auto loopArg = + cast(parentOp).getInitArgs()[arg.getArgNumber() - 1]; + return findDefiningMakeTensorPtrOp(loopArg); + } + + if (auto advanceOp = val.getDefiningOp()) + return findDefiningMakeTensorPtrOp(advanceOp.getPtr()); + if (auto makePtrOp = val.getDefiningOp()) + return makePtrOp; + + return std::nullopt; + } + + static bool filterUser(Operation *op) { + // Yield operations trigger updating the layout of the containing loop + // results, don't skip them. + if (isa(op)) + return false; + + // Skip operations that don't yield a result and contain no regions. + if (op->getNumResults() == 0 && op->getNumRegions() == 0) + return true; + + // Operations that do not yield a block pointer aren't interesting. + if (op->getNumRegions() == 0 && + llvm::none_of(op->getResultTypes(), [](Type resType) { + return tt::isTensorPointerType(resType); + })) + return true; + + return false; + } + + // Change the \p layout of the \p op result and propagate the new result type + // to its users. + void changeAndPropagateLayout(Operation *op, Attribute layout, + IRRewriter &rewriter) const { + assert(op && op->getNumResults() == 1 && + "Expecting operation yielding a result"); + + rewriter.modifyOpInPlace(op, [&]() { + Value res = op->getOpResult(0); + assert(tt::isTensorPointerType(res.getType()) && + "Expecting a block pointer"); + + auto ptrType = cast(res.getType()); + auto tensorType = cast(ptrType.getPointeeType()); + res.setType(tt::PointerType::get(getNewType(tensorType, layout), + ptrType.getAddressSpace())); + }); + LDBG("Coalesced op: " << *op); + + propagateLayout(op, layout, rewriter); + } + + // Propagate the layout of the \p root operation's result to its users. + void propagateLayout(Operation *root, Attribute layout, + IRRewriter &rewriter) const { + assert(root->getNumResults() != 0 && + "Expecting an operation yielding a result"); + + LDBG("root: " << *root); + for (Operation *user : root->getUsers()) { + if (filterUser(user)) + continue; + + LDBG("root's user: " << *user << "\n"); + if (auto forOp = dyn_cast(user)) { + propagateLayoutToArgsAndBody(forOp, root, layout, rewriter); + continue; + } + if (auto yieldOp = dyn_cast(user)) { + auto forOp = yieldOp->getParentOfType(); + propagateLayoutToLoopResults(forOp, layout, rewriter); + continue; + } + changeAndPropagateLayout(user, layout, rewriter); + } + } + + // Propagate the layout of the \p arg block argument to its users. 
+ void propagateLayout(BlockArgument arg, Attribute layout, + IRRewriter &rewriter) const { + LDBG("arg: " << arg); + for (Operation *user : arg.getUsers()) { + if (filterUser(user)) + continue; + + LDBG("arg's user: " << *user << "\n"); + if (auto yieldOp = dyn_cast(user)) { + auto forOp = yieldOp->getParentOfType(); + propagateLayoutToLoopResults(forOp, layout, rewriter); + continue; + } + changeAndPropagateLayout(user, layout, rewriter); + } + } + + // Propagate the layout of the \p root operation's result to the \p forOp loop + // init argument that uses it, and transitively to the operations in the loop + // body that use that argument. + void propagateLayoutToArgsAndBody(scf::ForOp forOp, Operation *root, + Attribute layout, + IRRewriter &rewriter) const { + assert(llvm::any_of(root->getUsers(), + [&](Operation *user) { return user == forOp; }) && + "Expecting the loop to be a user of the root operation"); + + for (BlockArgument arg : forOp.getRegionIterArgs()) { + Value loopArg = forOp.getInitArgs()[arg.getArgNumber() - 1]; + for (OpResult res : root->getResults()) { + if (res != loopArg || !tt::isTensorPointerType(res.getType())) + continue; + + LDBG("loopArg: " << loopArg); + + // Modify the layout of the loop init argument... + tt::PointerType ptrType = cast(arg.getType()); + auto tensorType = cast(ptrType.getPointeeType()); + arg.setType(tt::PointerType::get(getNewType(tensorType, layout), + ptrType.getAddressSpace())); + + // ... and then propagate it to the operations in the loop. + propagateLayout(arg, layout, rewriter); + } + } + } + + // Modify the given loop \p forOp and propagate the result of the enclosing + // loop. + void propagateLayoutToLoopResults(scf::ForOp forOp, Attribute layout, + IRRewriter &rewriter) const { + Operation *yieldOp = forOp.getBody()->getTerminator(); + + rewriter.modifyOpInPlace(forOp, [&]() { + for (auto [opType, res] : + llvm::zip(yieldOp->getOperandTypes(), forOp.getResults())) { + if (opType == res.getType()) + continue; + + assert(tt::isTensorPointerType(res.getType()) && + tt::isTensorPointerType(opType) && "Expecting blocked pointers"); + assert(cast( + cast(opType).getPointeeType()) + .getEncoding() == layout && + "Unexpected layout"); + + auto resType = cast(res.getType()); + RankedTensorType tensorType = ttgi::getRankedTensorType(resType); + res.setType(tt::PointerType::get(getNewType(tensorType, layout), + resType.getAddressSpace())); + } + }); + + propagateLayout(forOp, layout, rewriter); + } + void coalesceOp(Attribute encoding, Operation *op) { + LDBG("Coalescing op: " << *op); + OpBuilder builder(op); + // Convert operands - // For load/store with tensor pointers, we don't have to change the - // operands' type, we do this by changing the outputs' type of - // `make_tensor_ptr` + // Note: for load/store with a blocked pointers argument we cannot change + // the operand type, instead we change the output type of + // `make_tensor_ptr` and propagate the new output type along the def-use + // chain. 
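+    // Illustrative shape of the rewritten def-use chain (the #coalesced
+    // layout name is a placeholder; see the coalesce.mlir tests above):
+    //   %p = tt.make_tensor_ptr ...       : <tensor<64x32xf16, #coalesced>>
+    //   %q = tt.advance %p, ...           : <tensor<64x32xf16, #coalesced>>
+    //   %v = tt.load %q                   : !tt.ptr<tensor<64x32xf16, #coalesced>>
+    //   %w = triton_gpu.convert_layout %v : ... -> original user layout
+    // Loop iter_args and results carrying the block pointer are retyped too.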
SmallVector newArgs; - for (auto operand : op->getOperands()) { + for (Value operand : op->getOperands()) { auto tensorType = dyn_cast(operand.getType()); if (tensorType && - !isa(tensorType.getEncoding())) { - Type newType = getNewType(tensorType, encoding); - newArgs.push_back(builder.create( + !isa(tensorType.getEncoding())) { + RankedTensorType newType = getNewType(tensorType, encoding); + newArgs.push_back(builder.create( op->getLoc(), newType, operand)); } else { + assert(isa(operand.getType()) && + "Expecting operand to have blocked pointer type"); + auto defOp = findDefiningMakeTensorPtrOp(operand); + assert(defOp && "Expected a make_tensor_ptr operation"); + LDBG("Found make_tensor_ptr definition: " << *defOp); + IRRewriter rewriter(builder); + changeAndPropagateLayout(*defOp, encoding, rewriter); newArgs.push_back(operand); } } @@ -136,27 +315,34 @@ struct CoalescePass // Convert output types SmallVector newTypes; for (auto t : op->getResultTypes()) { - bool isAsync = isa(op); - newTypes.push_back(isAsync ? t : getNewType(t, encoding)); + assert(!isa(op) && + "AsyncCopyGlobalToLocalOp not supported for Intel GPU"); + newTypes.push_back(getNewType(cast(t), encoding)); } - // Construct new op with the new encoding + // Construct new op with the new encoding. Operation *newOp = builder.create(op->getLoc(), op->getName().getIdentifier(), newArgs, newTypes, op->getAttrs()); - // Cast the results back to the original layout + // Cast the results back to the original layout. for (size_t i = 0; i < op->getNumResults(); i++) { Value newResult = newOp->getResult(i); if (newTypes[i] != op->getResultTypes()[i]) { - newResult = builder.create( + newResult = builder.create( op->getLoc(), op->getResult(i).getType(), newResult); } op->getResult(i).replaceAllUsesWith(newResult); } + + LDBG("Old op: " << *op); + LDBG("newOp: " << *newOp); op->erase(); + + assert(succeeded(verify(newOp)) && "Operation verification failed"); } +public: void runOnOperation() override { // Run axis info analysis ModuleOp moduleOp = getOperation(); @@ -169,20 +355,27 @@ struct CoalescePass Value ptr = getMemAccessPtr(curr); if (!ptr) return; - // We only convert `tensor>` load/store - bool isPtrTensor = false; - if (auto tensorType = dyn_cast(ptr.getType())) - isPtrTensor = isa(tensorType.getElementType()); - if (!isPtrTensor) + + RankedTensorType refTensorType = ttgi::getRankedTensorType(ptr.getType()); + if (!refTensorType || !refTensorType.getEncoding()) return; - auto mod = curr->getParentOfType(); - int numWarps = triton::gpu::TritonGPUDialect::getNumWarps(mod); - int threadsPerWarp = - triton::gpu::TritonGPUDialect::getThreadsPerWarp(mod); + + int numWarps = ttg::TritonGPUDialect::getNumWarps(moduleOp); + int threadsPerWarp = ttg::TritonGPUDialect::getThreadsPerWarp(moduleOp); setCoalescedEncoding(axisInfoAnalysis, curr, numWarps, threadsPerWarp, layoutMap); }); + LLVM_DEBUG({ + DBGS() << "\nlayoutMap:" + << "\n"; + for (auto [op, encoding] : layoutMap) { + DBGS() << "op: " << *op << "\n"; + DBGS() << "encoding: " << encoding << "\n\n"; + } + llvm::errs() << "\n\n"; + }); + // For each memory op that has a layout L1: // 1. Create a coalesced memory layout L2 of the pointer operands // 2. Convert all operands from layout L1 to layout L2 @@ -190,9 +383,11 @@ struct CoalescePass // produces a tensor with layout L2 // 4. Convert the output of this new memory op back to L1 // 5. 
Replace all the uses of the original memory op by the new one - for (auto &kv : layoutMap) { - coalesceOp(kv.second, kv.first); + for (auto [op, layout] : layoutMap) { + coalesceOp(layout, op); } + + assert(succeeded(verify(moduleOp)) && "Module verification failed"); } };
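
A standalone sketch, not part of the patch, of the per-dimension rule applied by the updated MakeTensorPtrOpAxisInfoVisitor in the AxisInfo.cpp hunk above: a dimension is contiguous over the whole block iff its stride is a known constant 1, and its divisibility then combines the base-pointer alignment with the alignment of the other dimension's stride (rank-2 case). The helper name, struct, and sample values below are illustrative only.

// Minimal sketch (illustrative; not part of the patch) of the rank-2
// make_tensor_ptr axis-info rule.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

struct BlockPtrAxisInfo {
  std::vector<int64_t> contiguity, divisibility;
};

static BlockPtrAxisInfo
makeTensorPtrAxisInfo(const std::vector<int64_t> &blockShape,
                      const std::vector<std::optional<int64_t>> &strideConst,
                      const std::vector<int64_t> &strideDivisibility,
                      int64_t ptrDivisibility) {
  BlockPtrAxisInfo info;
  for (size_t dim = 0; dim < blockShape.size(); ++dim) {
    // Contiguous across the whole block only when the stride is constant 1.
    int64_t contig = (strideConst[dim] == 1) ? blockShape[dim] : 1;
    // Alignment is limited by the base pointer and by the stride of the
    // other dimension; non-contiguous dimensions get divisibility 1.
    int64_t div =
        contig > 1
            ? std::min(ptrDivisibility, strideDivisibility[dim == 0 ? 1 : 0])
            : 1;
    info.contiguity.push_back(contig);
    info.divisibility.push_back(div);
  }
  return info;
}

int main() {
  // Block shape 128x64, strides [1, %s] with %s unknown but 16-divisible,
  // base pointer 16-divisible: dim 0 is fully contiguous and keeps the
  // min(16, 16) alignment, dim 1 is neither contiguous nor aligned.
  BlockPtrAxisInfo info =
      makeTensorPtrAxisInfo({128, 64}, {1, std::nullopt}, {1, 16}, 16);
  std::cout << info.contiguity[0] << " " << info.contiguity[1] << "\n";     // 128 1
  std::cout << info.divisibility[0] << " " << info.divisibility[1] << "\n"; // 16 1
}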