-// RUN: triton-opt %s -split-input-file -tritonamdgpu-optimize-dot-operands="arch-generation-name=gfx950" | FileCheck %s --check-prefixes CHECK,GFX950
-// RUN: triton-opt %s -split-input-file -tritonamdgpu-optimize-dot-operands="arch-generation-name=gfx942" | FileCheck %s
-
-#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
-#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [32, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[0, 0], [0, 0]], block = []}>
-#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>
-#mma1 = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>
-// CHECK{LITERAL}: #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 8, order = [0, 1]}>
-// CHECK{LITERAL}: #smem = #ttg.shared_memory
-// CHECK-LABEL: test_local_load_transposed
-// CHECK: %[[LOAD:.+]] = tt.load {{.*}} : tensor<64x16x!tt.ptr<f16>, #blocked>
-// CHECK: %[[ALLOC:.+]] = ttg.local_alloc %[[LOAD]] : (tensor<64x16xf16, #blocked>) -> !ttg.memdesc<64x16xf16, #shared, #smem>
-// CHECK: %[[LOCAL_LOAD_TRANS:.+]] = ttg.local_load %[[ALLOC]] : !ttg.memdesc<64x16xf16, #shared, #smem> -> tensor<64x16xf16, #linear>
-// CHECK: %[[LOCAL_LOAD_DIRECT:.+]] = ttg.local_load %[[ALLOC]] : !ttg.memdesc<64x16xf16, #shared, #smem> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
-// CHECK: tt.dot {{.+}}, %[[LOCAL_LOAD_DIRECT]], {{.+}}: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x16xf32, #mma>
-// CHECK: %[[TRANS:.+]] = tt.trans %[[LOCAL_LOAD_TRANS]] {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-// CHECK: tt.dot {{.+}}, %[[TRANS]], {{.+}} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<128x64xf32, #mma1>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_local_load_transposed(
-    %arg0: tensor<64x16x!tt.ptr<f16>, #blocked>,
-    %out0 : tensor<128x16x!tt.ptr<f32>, #blocked>,
-    %out1 : tensor<128x64x!tt.ptr<f32>, #blocked>
-  ) {
-    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1>
-    %cst_1 = arith.constant dense<0.693147182> : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>>
-    %cst_2 = arith.constant dense<0.581374812> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
-    %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
-
-    %0 = tt.load %arg0 : tensor<64x16x!tt.ptr<f16>, #blocked>
-    %1 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #linear>
-    %2 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-    %3 = tt.dot %cst_1, %2, %cst_0 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<128x16xf32, #mma1>
-    %4 = tt.trans %1 {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
-    %5 = tt.dot %cst_2, %4, %cst_3 : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x64xf32, #mma>
-
-    %6 = ttg.convert_layout %3 : tensor<128x16xf32, #mma1> -> tensor<128x16xf32, #blocked>
-    %7 = ttg.convert_layout %5 : tensor<128x64xf32, #mma> -> tensor<128x64xf32, #blocked>
-    tt.store %out0, %6 : tensor<128x16x!tt.ptr<f32>, #blocked>
-    tt.store %out1, %7 : tensor<128x64x!tt.ptr<f32>, #blocked>
-    tt.return
-  }
-}
-// -----
-
-#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
-#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [32, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[0, 0], [0, 0]], block = []}>
-#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>
-#mma1 = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>
-// CHECK-NOT: #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 8, order = [0, 1]}>
-// CHECK-NOT: #smem = #ttg.shared_memory
-// CHECK-LABEL: test_not_local_load_transposed_kWidth_mismatch
-// CHECK: tt.load {{.*}} : tensor<64x16x!tt.ptr<f16>, #blocked>
-// CHECK-NOT: ttg.local_alloc
-// CHECK-NOT: ttg.local_load
-// CHECK-NOT: ttg.local_load
-// CHECK: tt.dot {{.+}}: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> tensor<128x16xf32, #mma>
-// CHECK: tt.trans {{.+}} {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-// CHECK: tt.dot {{.+}} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<128x64xf32, #mma1>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_not_local_load_transposed_kWidth_mismatch(
-    %arg0: tensor<64x16x!tt.ptr<f16>, #blocked>,
-    %out0 : tensor<128x16x!tt.ptr<f32>, #blocked>,
-    %out1 : tensor<128x64x!tt.ptr<f32>, #blocked>
-  ) {
-    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1>
-    %cst_1 = arith.constant dense<0.693147182> : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>>
-    %cst_2 = arith.constant dense<0.581374812> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
-    %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
-
-    %0 = tt.load %arg0 : tensor<64x16x!tt.ptr<f16>, #blocked>
-    %1 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #linear>
-    %2 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>>
-    %3 = tt.dot %cst_1, %2, %cst_0 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>> -> tensor<128x16xf32, #mma1>
-    %4 = tt.trans %1 {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
-    %5 = tt.dot %cst_2, %4, %cst_3 : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x64xf32, #mma>
-
-    %6 = ttg.convert_layout %3 : tensor<128x16xf32, #mma1> -> tensor<128x16xf32, #blocked>
-    %7 = ttg.convert_layout %5 : tensor<128x64xf32, #mma> -> tensor<128x64xf32, #blocked>
-    tt.store %out0, %6 : tensor<128x16x!tt.ptr<f32>, #blocked>
-    tt.store %out1, %7 : tensor<128x64x!tt.ptr<f32>, #blocked>
-    tt.return
-  }
-}
-// -----
-
-#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
-#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [32, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[0, 0], [0, 0]], block = []}>
-#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>
-#mma1 = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16, 32], isTransposed = true}>
-// CHECK-NOT: #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 8, order = [0, 1]}>
-// CHECK-NOT: #smem = #ttg.shared_memory
-// CHECK-LABEL: test_not_local_load_transposed_opIdx_mismatch
-// CHECK: tt.load {{.*}} : tensor<64x16x!tt.ptr<f16>, #blocked>
-// CHECK-NOT: ttg.local_alloc
-// CHECK-NOT: ttg.local_load
-// CHECK-NOT: ttg.local_load
-// CHECK: tt.dot {{.+}}: tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<64x64xf32, #mma>
-// CHECK: tt.trans {{.+}} {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-// CHECK: tt.dot {{.+}} : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<128x64xf32, #mma1>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @test_not_local_load_transposed_opIdx_mismatch(
-    %arg0: tensor<64x16x!tt.ptr<f16>, #blocked>,
-    %out0 : tensor<64x64x!tt.ptr<f32>, #blocked>,
-    %out1 : tensor<128x64x!tt.ptr<f32>, #blocked>
-  ) {
-    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma1>
-    %cst_1 = arith.constant dense<0.693147182> : tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
-    %cst_2 = arith.constant dense<0.581374812> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
-    %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
-
-    %0 = tt.load %arg0 : tensor<64x16x!tt.ptr<f16>, #blocked>
-    %1 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #linear>
-    %2 = ttg.convert_layout %0 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>>
-    %3 = tt.dot %2, %cst_1, %cst_0 : tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>> -> tensor<64x64xf32, #mma1>
-    %4 = tt.trans %1 {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
-    %5 = tt.dot %cst_2, %4, %cst_3 : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x64xf32, #mma>
-
-    %6 = ttg.convert_layout %3 : tensor<64x64xf32, #mma1> -> tensor<64x64xf32, #blocked>
-    %7 = ttg.convert_layout %5 : tensor<128x64xf32, #mma> -> tensor<128x64xf32, #blocked>
-    tt.store %out0, %6 : tensor<64x64x!tt.ptr<f32>, #blocked>
-    tt.store %out1, %7 : tensor<128x64x!tt.ptr<f32>, #blocked>
-    tt.return
-  }
-}
-
-// -----
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-optimize-dot-operands="arch-generation-name=gfx950" | FileCheck %s --check-prefixes GFX950
 
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 4], warpsPerCTA = [2, 1], order = [1, 0]}>
 #linear = #ttg.linear<{register = [[0, 0, 1], [0, 0, 2], [0, 0, 4], [1, 0, 0], [2, 0, 0], [0, 32, 0], [0, 64, 0]], lane = [[0, 1, 0], [0, 2, 0], [0, 4, 0], [0, 8, 0], [0, 0, 8], [0, 0, 16]], warp = [[0, 16, 0]], block = []}>