// RUN: triton-opt %s -split-input-file -triton-intel-stride-versioning | FileCheck %s

// Positive test: the pitch strides of both tensor pointers (%arg3 and %arg5)
// are runtime values, so the pass is expected to version the loop — clone each
// tt.make_tensor_ptr with a compile-time unit stride, build a runtime guard
// (stride == 1 for both pointers), and emit the specialized and original loop
// bodies under the two branches of an scf.if.
module {
  tt.func public @version_for_loop(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg2: i64, %arg3: i64, %arg4: i64 {tt.divisibility = 16 : i32}, %arg5: i64) {
    %c64_i32 = arith.constant 64 : i32
    %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32>
    %c256_i32 = arith.constant 256 : i32
    %c32_i32 = arith.constant 32 : i32
    %c0_i32 = arith.constant 0 : i32
    %c4096_i32 = arith.constant 4096 : i32
    %c8192_i32 = arith.constant 8192 : i32
    %c4_i32 = arith.constant 4 : i32
    %0 = tt.get_program_id x : i32
    %1 = arith.divsi %0, %c64_i32 : i32
    %2 = arith.muli %1, %c4_i32 : i32
    %3 = arith.subi %c32_i32, %2 : i32
    %4 = arith.minsi %3, %c4_i32 : i32
    %5 = arith.remsi %0, %c64_i32 : i32
    %6 = arith.remsi %5, %4 : i32
    %7 = arith.addi %2, %6 : i32
    %8 = arith.divsi %5, %4 : i32
    %9 = arith.extsi %c8192_i32 : i32 to i64
    %10 = arith.extsi %c4096_i32 : i32 to i64
    // Strides %arg3 and %arg5 are function arguments, i.e. unknown at compile
    // time — these two pointers are the versioning candidates.
    %11 = tt.make_tensor_ptr %arg0, [%9, %10], [%arg2, %arg3], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<256x32xbf16>>
    %12 = tt.make_tensor_ptr %arg1, [%10, %10], [%arg4, %arg5], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x256xbf16>>
    %13 = arith.muli %7, %c256_i32 : i32
    %14 = arith.muli %8, %c256_i32 : i32
    %15:2 = scf.for %arg9 = %c0_i32 to %c4096_i32 step %c32_i32 iter_args(%arg10 = %cst, %arg11 = %c0_i32) -> (tensor<256x256xf32>, i32) : i32 {
      %20 = tt.advance %11, [%13, %arg11] : <tensor<256x32xbf16>>
      %21 = tt.load %20 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xbf16>>
      %22 = tt.advance %12, [%arg11, %14] : <tensor<32x256xbf16>>
      %23 = tt.load %22 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xbf16>>
      %24 = tt.dot %21, %23, %cst, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
      %25 = arith.addf %arg10, %24 : tensor<256x256xf32>
      %26 = arith.addi %arg11, %c32_i32 : i32
      scf.yield %25, %26 : tensor<256x256xf32>, i32
    }
    tt.return
  }

  // The unit-stride constant is captured once and referenced everywhere below,
  // rather than relying on the asm printer emitting the literal name %c1_i64.
  // CHECK: tt.func public @version_for_loop
  // CHECK: [[CST_1_i64:%.+]] = arith.constant 1 : i64
  // CHECK-DAG: [[NEW_PTR1:%.+]] = tt.make_tensor_ptr %arg0, {{.*}}, [%arg2, [[CST_1_i64]]], {{.*}} {order = array<i32: 1, 0>} : <tensor<256x32xbf16>>
  // CHECK-DAG: [[ORIG_PTR1:%.+]] = tt.make_tensor_ptr %arg0, {{.*}}, [%arg2, %arg3], {{.*}} {order = array<i32: 1, 0>} : <tensor<256x32xbf16>>
  // CHECK: [[NEW_PTR2:%.+]] = tt.make_tensor_ptr %arg1, {{.*}}, [%arg4, [[CST_1_i64]]], {{.*}} {order = array<i32: 1, 0>} : <tensor<32x256xbf16>>
  // CHECK: [[ORIG_PTR2:%.+]] = tt.make_tensor_ptr %arg1, {{.*}}, [%arg4, %arg5], {{.*}} {order = array<i32: 1, 0>} : <tensor<32x256xbf16>>
  // CHECK-DAG: [[CMP1:%.+]] = arith.cmpi eq, %arg3, [[CST_1_i64]] : i64
  // CHECK-DAG: [[CMP2:%.+]] = arith.cmpi eq, %arg5, [[CST_1_i64]] : i64
  // CHECK: [[VER_COND:%.+]] = arith.andi [[CMP1]], [[CMP2]] : i1
  // CHECK: [[LOOP_VER:%.+]]:2 = scf.if [[VER_COND]]
  // CHECK: scf.for
  // CHECK: tt.advance [[NEW_PTR1]]
  // CHECK: tt.advance [[NEW_PTR2]]
  // CHECK: } else {
  // CHECK: scf.for
  // CHECK: tt.advance [[ORIG_PTR1]]
  // CHECK: tt.advance [[ORIG_PTR2]]
  // CHECK: }
}

// -----

// Negative test: both tensor pointers already have compile-time-constant
// strides (%c4_i64 / %c2_i64), so there is nothing to guard at runtime.
// The pass must leave the loop alone — no scf.if versioning is introduced
// and the original pointers feed the tt.advance ops directly.
module {
  tt.func public @do_not_version(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg2: i64, %arg3: i64, %arg4: i64 {tt.divisibility = 16 : i32}, %arg5: i64) {
    %c64_i32 = arith.constant 64 : i32
    %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32>
    %c256_i32 = arith.constant 256 : i32
    %c32_i32 = arith.constant 32 : i32
    %c0_i32 = arith.constant 0 : i32
    %c4096_i32 = arith.constant 4096 : i32
    %c8192_i32 = arith.constant 8192 : i32
    %c4_i32 = arith.constant 4 : i32
    %c2_i64 = arith.constant 2 : i64
    %c4_i64 = arith.constant 4 : i64
    %0 = tt.get_program_id x : i32
    %1 = arith.divsi %0, %c64_i32 : i32
    %2 = arith.muli %1, %c4_i32 : i32
    %3 = arith.subi %c32_i32, %2 : i32
    %4 = arith.minsi %3, %c4_i32 : i32
    %5 = arith.remsi %0, %c64_i32 : i32
    %6 = arith.remsi %5, %4 : i32
    %7 = arith.addi %2, %6 : i32
    %8 = arith.divsi %5, %4 : i32
    %9 = arith.extsi %c8192_i32 : i32 to i64
    %10 = arith.extsi %c4096_i32 : i32 to i64
    // Constant (non-unit) strides: the versioning condition is statically
    // decidable, so no runtime guard should be generated.
    %11 = tt.make_tensor_ptr %arg0, [%9, %10], [%c4_i64, %c2_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<256x32xbf16>>
    %12 = tt.make_tensor_ptr %arg1, [%10, %10], [%c2_i64, %c4_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x256xbf16>>
    %13 = arith.muli %7, %c256_i32 : i32
    %14 = arith.muli %8, %c256_i32 : i32
    %15:2 = scf.for %arg9 = %c0_i32 to %c4096_i32 step %c32_i32 iter_args(%arg10 = %cst, %arg11 = %c0_i32) -> (tensor<256x256xf32>, i32) : i32 {
      %20 = tt.advance %11, [%13, %arg11] : <tensor<256x32xbf16>>
      %21 = tt.load %20 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xbf16>>
      %22 = tt.advance %12, [%arg11, %14] : <tensor<32x256xbf16>>
      %23 = tt.load %22 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xbf16>>
      %24 = tt.dot %21, %23, %cst, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
      %25 = arith.addf %arg10, %24 : tensor<256x256xf32>
      %26 = arith.addi %arg11, %c32_i32 : i32
      scf.yield %25, %26 : tensor<256x256xf32>, i32
    }
    tt.return
  }

  // CHECK: tt.func public @do_not_version
  // CHECK-DAG: [[PTR1:%.+]] = tt.make_tensor_ptr %arg0
  // CHECK-DAG: [[PTR2:%.+]] = tt.make_tensor_ptr %arg1
  // CHECK-NOT: scf.if
  // CHECK: scf.for
  // CHECK: tt.advance [[PTR1]]
  // CHECK: tt.advance [[PTR2]]
}