// RUN: triton-opt %s --split-input-file --decompose-unsupported-amd-conversions | FileCheck %s

// Check that a WMMA-accumulator -> WMMA dot-operand layout conversion is
// decomposed through a blocked layout and shared memory:
// convert_layout (wmma -> blocked), local_alloc, then local_load into the
// dot-operand layout. Target arch is taken from the module's
// triton_gpu.target attribute (the pass no longer takes an arch option).
// CHECK: #[[$BLOCKED:.+]] = #triton_gpu.blocked<{{.*}}>
// CHECK: #[[$WMMA:.+]] = #triton_gpu.amd_wmma<{{.*}}>
// CHECK: #[[$SHARED:.+]] = #triton_gpu.shared<{{.*}}>
// CHECK-LABEL: wmma_to_wmma_dot_op
#mma = #triton_gpu.amd_wmma<{version = 1, warpsPerCTA = [2, 2]}>
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx1130", "triton_gpu.threads-per-warp" = 32 : i32} {
  tt.func @wmma_to_wmma_dot_op(%arg0: tensor<16x16xf16, #mma>) {
    // CHECK: %[[SRC_BLOCKED:.+]] = triton_gpu.convert_layout %{{.*}} : tensor<16x16xf16, #[[$WMMA]]> -> tensor<16x16xf16, #[[$BLOCKED]]>
    // CHECK-NEXT: %[[INT_SHARED:.+]] = triton_gpu.local_alloc %[[SRC_BLOCKED]] : {{.*}} -> !tt.memdesc<16x16xf16, #[[$SHARED]], #triton_gpu.shared_memory>
    // CHECK-NEXT: %[[DST_DOT_OP:.+]] = triton_gpu.local_load %[[INT_SHARED]] : {{.*}} -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[$WMMA]], kWidth = 16}>>
    %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>
    tt.return
  }
}

// -----

// 3D variant of the WMMA -> dot-operand decomposition check.
// NOTE(review): the RUN line no longer passes arch=gfx1130, so the pass reads
// the arch from the module; this module was missing the triton_gpu.target
// attribute that the 2D test above carries — added for consistency.
// CHECK: #[[$BLOCKED:.+]] = #triton_gpu.blocked<{{.*}}>
// CHECK: #[[$WMMA:.+]] = #triton_gpu.amd_wmma<{{.*}}>
// CHECK: #[[$SHARED:.+]] = #triton_gpu.shared<{{.*}}>
// CHECK-LABEL: wmma_to_wmma_dot3d_op
#mma = #triton_gpu.amd_wmma<{version = 1, warpsPerCTA = [2, 2, 2]}>
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, triton_gpu.target = "hip:gfx1130", "triton_gpu.threads-per-warp" = 32 : i32} {
  tt.func @wmma_to_wmma_dot3d_op(%arg0: tensor<2x16x16xf16, #mma>) {
    // CHECK: %[[SRC_BLOCKED:.+]] = triton_gpu.convert_layout %{{.*}} : tensor<2x16x16xf16, #[[$WMMA]]> -> tensor<2x16x16xf16, #[[$BLOCKED]]>
    // CHECK-NEXT: %[[INT_SHARED:.+]] = triton_gpu.local_alloc %[[SRC_BLOCKED]] : {{.*}} -> !tt.memdesc<2x16x16xf16, #[[$SHARED]], #triton_gpu.shared_memory>
    // CHECK-NEXT: %[[DST_DOT_OP:.+]] = triton_gpu.local_load %[[INT_SHARED]] : {{.*}} -> tensor<2x16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[$WMMA]], kWidth = 16}>>
    %0 = triton_gpu.convert_layout %arg0 : tensor<2x16x16xf16, #mma> -> tensor<2x16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>
    tt.return
  }
}

// -----

// On gfx1130 (warp size 32) this blocked -> dot-operand conversion is a
// register-to-register shortcut: no shared-memory round trip is emitted.
// CHECK-LABEL: blocked_to_dot_op_shortcut_gfx1130
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [1, 0]}>
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx1130", "triton_gpu.threads-per-warp" = 32 : i32} {
  tt.func @blocked_to_dot_op_shortcut_gfx1130(%arg0: tensor<32x32xf16, #blocked>) {
    // CHECK-NOT: triton_gpu.local_alloc
    // CHECK: triton_gpu.convert_layout
    // CHECK-NOT: triton_gpu.local_alloc
    %0 = triton_gpu.convert_layout %arg0 : tensor<32x32xf16, #blocked> -> tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>
    tt.return
  }
}

// -----

// Same shortcut on gfx940 (warp size 64): conversion stays in registers.
// CHECK-LABEL: blocked_to_dot_op_shortcut_gfx940
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [32, 2], warpsPerCTA = [2, 2], order = [1, 0]}>
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx940", "triton_gpu.threads-per-warp" = 64 : i32} {
  tt.func @blocked_to_dot_op_shortcut_gfx940(%arg0: tensor<32x32xf16, #blocked>) {
    // CHECK-NOT: triton_gpu.local_alloc
    // CHECK: triton_gpu.convert_layout
    // CHECK-NOT: triton_gpu.local_alloc
    %0 = triton_gpu.convert_layout %arg0 : tensor<32x32xf16, #blocked> -> tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>
    tt.return
  }
}

// -----

// Negative test: sizePerThread does not line up with the dot-operand layout,
// so the shortcut must NOT fire — expect the shared-memory round trip.
// CHECK-LABEL: neg_blocked_to_dot_op_incompatible_elems_gfx940
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [32, 2], warpsPerCTA = [2, 2], order = [1, 0]}>
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx940", "triton_gpu.threads-per-warp" = 64 : i32} {
  tt.func @neg_blocked_to_dot_op_incompatible_elems_gfx940(%arg0: tensor<32x32xf16, #blocked>) {
    // CHECK-NOT: triton_gpu.convert_layout
    // CHECK: triton_gpu.local_alloc
    // CHECK: triton_gpu.local_load
    %0 = triton_gpu.convert_layout %arg0 : tensor<32x32xf16, #blocked> -> tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>
    tt.return
  }
}

// -----

// Negative test: src and dot-operand parent layouts differ in threadsPerWarp,
// so the shortcut must NOT fire — expect the shared-memory round trip.
// CHECK-LABEL: neg_blocked_to_dot_op_incompatible_threads_gfx940
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [32, 2], warpsPerCTA = [2, 2], order = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [16, 4], warpsPerCTA = [2, 2], order = [1, 0]}>
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx940", "triton_gpu.threads-per-warp" = 64 : i32} {
  tt.func @neg_blocked_to_dot_op_incompatible_threads_gfx940(%arg0: tensor<32x32xf16, #blocked>) {
    // CHECK-NOT: triton_gpu.convert_layout
    // CHECK: triton_gpu.local_alloc
    // CHECK: triton_gpu.local_load
    %0 = triton_gpu.convert_layout %arg0 : tensor<32x32xf16, #blocked> -> tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>
    tt.return
  }
}

// -----

// Negative test: src and dot-operand parent layouts differ in warpsPerCTA,
// so the shortcut must NOT fire — expect the shared-memory round trip.
// CHECK-LABEL: neg_blocked_to_dot_op_incompatible_warp_gfx940
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [32, 2], warpsPerCTA = [2, 2], order = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx940", "triton_gpu.threads-per-warp" = 64 : i32} {
  tt.func @neg_blocked_to_dot_op_incompatible_warp_gfx940(%arg0: tensor<32x32xf16, #blocked>) {
    // CHECK-NOT: triton_gpu.convert_layout
    // CHECK: triton_gpu.local_alloc
    // CHECK: triton_gpu.local_load
    %0 = triton_gpu.convert_layout %arg0 : tensor<32x32xf16, #blocked> -> tensor<32x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>
    tt.return
  }
}