// RUN: triton-opt %s -split-input-file --convert-triton-intel-gpu-to-llvm --tritonintelgpu-rewrite-stack-ptr | FileCheck %s

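// COM: Case 1: the module declares no shared memory (ttg.shared = 0). The
// COM: rewrite-stack-ptr pass still extends the ABI of the noinline call with
// COM: two trailing pointer arguments (a !llvm.ptr<3> shared-memory base and a
// COM: !llvm.ptr<1> scratch pointer), so the caller materializes a poison
// COM: !llvm.ptr<3> for the unused shared-memory base.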
module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_bf16_conversion, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 0 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32} {
  // CHECK-LABEL: llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
  // CHECK-LABEL: llvm.func spir_kernelcc @kernel(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>)
  tt.func public @kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
    %0 = tt.load %arg0 : !tt.ptr<f32>
    %1 = tt.load %arg1 : !tt.ptr<f32>
    // CHECK: llvm.mlir.poison : !llvm.ptr<3>
    // CHECK: llvm.call @noinline_simple_fn__fp32_fp32_Pfp32__(%8, %17, %arg2, %18, %arg2)
    tt.call @noinline_simple_fn__fp32_fp32_Pfp32__(%0, %1, %arg2) : (f32, f32, !tt.ptr<f32>) -> ()
    tt.return
  }
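  // COM: The callee performs only a scalar add and store, yet its lowered
  // COM: signature still carries the extra !llvm.ptr<3> / !llvm.ptr<1> parameters.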
  // CHECK: llvm.func internal @noinline_simple_fn__fp32_fp32_Pfp32__(%arg0: f32, %arg1: f32, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<3>, %arg4: !llvm.ptr<1>)
  tt.func private @noinline_simple_fn__fp32_fp32_Pfp32__(%arg0: f32 {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 1 : i64}, %arg1: f32 {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 1 : i64}, %arg2: !tt.ptr<f32> {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 16 : i64}) attributes {noinline = true} {
    %0 = arith.addf %arg0, %arg1 fastmath<fast> : f32
    tt.store %arg2, %0 : !tt.ptr<f32>
    tt.return
  }
}

// -----

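// COM: Case 2: the module allocates shared memory (ttg.shared = 1280), so the
// COM: lowered kernel signature gains a !llvm.ptr<3> shared-memory argument
// COM: (%arg3) that is forwarded unchanged to the noinline callee.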
#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 4], warpsPerCTA = [1, 1], order = [1, 0]}>
#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 1], A = [16, 8], B = [8, 16], C = [16, 16]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_bf16_conversion, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 1280 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32} {
  // CHECK-LABEL: llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
  // CHECK-LABEL: llvm.func spir_kernelcc @kernel(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<3>)
  tt.func public @kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
    %0 = tt.load %arg0 : !tt.ptr<f32>
    %1 = tt.load %arg1 : !tt.ptr<f32>
    // CHECK: llvm.call @noinline_shared_fn__fp32_fp32_Pfp32__(%8, %17, %arg2, %arg3, %arg2)
    tt.call @noinline_shared_fn__fp32_fp32_Pfp32__(%0, %1, %arg2) {allocation.offset = 0 : i32} : (f32, f32, !tt.ptr<f32>) -> ()
    tt.return
  }
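  // COM: The callee indexes into the caller-provided shared-memory base
  // COM: (%arg3) for its local_alloc / local_load round-trip through the dot
  // COM: operands and for the mma-to-blocked layout conversion, both of which
  // COM: use shared memory.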
  // CHECK: llvm.func internal @noinline_shared_fn__fp32_fp32_Pfp32__(%arg0: f32, %arg1: f32, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<3>, %arg4: !llvm.ptr<1>)
  // CHECK: llvm.getelementptr %arg3[{{.*}}]
  tt.func private @noinline_shared_fn__fp32_fp32_Pfp32__(%arg0: f32 {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 1 : i64}, %arg1: f32 {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 1 : i64}, %arg2: !tt.ptr<f32> {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 16 : i64}) attributes {noinline = true} {
    %cst = arith.constant dense<16> : tensor<16x1xi32, #blocked>
    %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
    %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked>
    %2 = arith.muli %1, %cst : tensor<16x1xi32, #blocked>
    %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
    %4 = tt.expand_dims %3 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked>
    %5 = tt.broadcast %2 : tensor<16x1xi32, #blocked> -> tensor<16x16xi32, #blocked>
    %6 = tt.broadcast %4 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked>
    %7 = arith.addi %5, %6 : tensor<16x16xi32, #blocked>
    %8 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<16x16x!tt.ptr<f32>, #blocked>
    %9 = tt.addptr %8, %7 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
    %10 = tt.load %9 : tensor<16x16x!tt.ptr<f32>, #blocked>
    %11 = ttg.local_alloc %10 {allocation.offset = 0 : i32} : (tensor<16x16xf32, #blocked>) -> !ttg.memdesc<16x16xf32, #shared, #smem>
    %12 = tt.splat %arg0 : f32 -> tensor<16x16xf32, #mma>
    %13 = ttg.local_load %11 : !ttg.memdesc<16x16xf32, #shared, #smem> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
    %14 = ttg.local_load %11 : !ttg.memdesc<16x16xf32, #shared, #smem> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
    %15 = tt.dot %13, %14, %12, inputPrecision = tf32 : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf32, #mma>
    %16 = tt.splat %arg1 : f32 -> tensor<16x16xf32, #mma>
    %17 = arith.addf %15, %16 fastmath<fast> : tensor<16x16xf32, #mma>
    %18 = ttg.convert_layout %17 {allocation.offset = 0 : i32} : tensor<16x16xf32, #mma> -> tensor<16x16xf32, #blocked>
    tt.store %9, %18 : tensor<16x16x!tt.ptr<f32>, #blocked>
    tt.return
  }
}