3
3
// Invalid size: result shape is not a multiple of shapePerCTATile
4
4
#blocked1 = #ttg.blocked <{sizePerThread = [8 , 1 ], threadsPerWarp = [4 , 16 ], warpsPerCTA = [8 , 1 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [0 , 1 ]}>
5
5
tt.func @invalid_size_input (%arg0: tensor <256 x128 xi32 , #blocked1 > {tt.divisibility = 16 : i32 }) {
6
- // expected-error @+1 {{sizes [256, 2] must be a multiple of shapePerCTATile [256, 16] }}
6
+ // expected-error @+1 {{result shape must be multiple of shapePerCTATile}}
7
7
%1 = amdgpu.extract_slice %arg0 [0 ,0 ] : tensor <256 x128 xi32 , #blocked1 > to tensor <256 x2 xi32 , #blocked1 >
8
8
tt.return
9
9
}
10
10
11
11
// -----
12
12
13
- // Invalid zero source dimension
14
- #blocked1 = #ttg.blocked <{sizePerThread = [8 , 1 ], threadsPerWarp = [4 , 16 ], warpsPerCTA = [8 , 1 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [0 , 1 ]}>
15
- tt.func @invalid_size_input (%arg0: tensor <256 x0 xi32 , #blocked1 > {tt.divisibility = 16 : i32 }) {
16
- // expected-error @+1 {{source tensor dimension size zero at dimension 1}}
17
- %1 = amdgpu.extract_slice %arg0 [0 ,0 ] : tensor <256 x0 xi32 , #blocked1 > to tensor <256 x16 xi32 , #blocked1 >
18
- tt.return
19
- }
20
-
21
- // -----
22
-
23
- // Invalid zero result dimension
24
- #blocked1 = #ttg.blocked <{sizePerThread = [8 , 1 ], threadsPerWarp = [4 , 16 ], warpsPerCTA = [8 , 1 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [0 , 1 ]}>
25
- tt.func @invalid_size_input (%arg0: tensor <256 x128 xi32 , #blocked1 > {tt.divisibility = 16 : i32 }) {
26
- // expected-error @+1 {{result tensor dimension size zero at dimension 1}}
27
- %1 = amdgpu.extract_slice %arg0 [0 ,0 ] : tensor <256 x128 xi32 , #blocked1 > to tensor <256 x0 xi32 , #blocked1 >
28
- tt.return
29
- }
30
-
31
- // -----
32
-
33
13
// Invalid offset, not a multiple of shapePerCTATile
34
14
#blocked1 = #ttg.blocked <{sizePerThread = [8 , 1 ], threadsPerWarp = [4 , 16 ], warpsPerCTA = [8 , 1 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [0 , 1 ]}>
35
15
tt.func @invalid_offset_input (%arg0: tensor <256 x128 xi32 , #blocked1 > {tt.divisibility = 16 : i32 }) {
36
- // expected-error @+1 {{offset [0, 5] must be a multiple of shapePerCTATile [256, 16] }}
16
+ // expected-error @+1 {{offset must be multiple of shapePerCTATile}}
37
17
%1 = amdgpu.extract_slice %arg0 [0 ,5 ] : tensor <256 x128 xi32 , #blocked1 > to tensor <256 x16 xi32 , #blocked1 >
38
18
tt.return
39
19
}
@@ -43,7 +23,7 @@ tt.func @invalid_offset_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibi
43
23
// Invalid offset, out of bounds for dimension 1
44
24
#blocked1 = #ttg.blocked <{sizePerThread = [8 , 1 ], threadsPerWarp = [4 , 16 ], warpsPerCTA = [8 , 1 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [0 , 1 ]}>
45
25
tt.func @invalid_offset_input (%arg0: tensor <256 x128 xi32 , #blocked1 > {tt.divisibility = 16 : i32 }) {
46
- // expected-error @+1 {{invalid offset 128 at dimension 1}}
26
+ // expected-error @+1 {{invalid offset at dimension 1}}
47
27
%1 = amdgpu.extract_slice %arg0 [0 ,128 ] : tensor <256 x128 xi32 , #blocked1 > to tensor <256 x16 xi32 , #blocked1 >
48
28
tt.return
49
29
}
@@ -54,11 +34,10 @@ tt.func @invalid_offset_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibi
54
34
#blocked1 = #ttg.blocked <{sizePerThread = [8 , 1 ], threadsPerWarp = [4 , 16 ], warpsPerCTA = [8 , 1 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [0 , 1 ]}>
55
35
#blocked2 = #ttg.blocked <{sizePerThread = [4 , 1 ], threadsPerWarp = [4 , 16 ], warpsPerCTA = [8 , 1 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [0 , 1 ]}>
56
36
tt.func @invalid_result_layout (%arg0: tensor <256 x128 xi32 , #blocked1 > {tt.divisibility = 16 : i32 }) {
57
- // expected-error @+1 {{result layout must match source layout }}
37
+ // expected-error @+1 {{CTA tile shapes must match between source and destination tensors. }}
58
38
%1 = amdgpu.extract_slice %arg0 [0 ,0 ] : tensor <256 x128 xi32 , #blocked1 > to tensor <256 x16 xi32 , #blocked2 >
59
39
tt.return
60
40
}
61
-
62
41
// -----
63
42
64
43
// Invalid result element type
@@ -84,23 +63,13 @@ tt.func @invalid_result_rank(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibil
84
63
// Invalid result shape: result exceeds source shape at dimension 1
85
64
#blocked1 = #ttg.blocked <{sizePerThread = [8 , 1 ], threadsPerWarp = [4 , 16 ], warpsPerCTA = [8 , 1 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [0 , 1 ]}>
86
65
tt.func @invalid_result_rank (%arg0: tensor <256 x128 xi32 , #blocked1 > {tt.divisibility = 16 : i32 }) {
87
- // expected-error @+1 {{result shape cannot be larger than input shape at dimension 1}}
66
+ // expected-error @+1 {{result shape cannot exceed source shape at dimension 1}}
88
67
%1 = amdgpu.extract_slice %arg0 [0 ,0 ] : tensor <256 x128 xi32 , #blocked1 > to tensor <256 x256 xi32 , #blocked1 >
89
68
tt.return
90
69
}
91
70
92
71
// -----
93
72
94
- // Invalid rank
95
- #blocked1 = #ttg.blocked <{sizePerThread = [8 , 1 ], threadsPerWarp = [4 , 16 ], warpsPerCTA = [8 , 1 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [0 , 1 ]}>
96
- tt.func @invalid_rank (%arg0: tensor <256 x128 x2 xi32 , #blocked1 > {tt.divisibility = 16 : i32 }) {
97
- // expected-error @+1 {{currently only 2D tensors are supported}}
98
- %1 = amdgpu.extract_slice %arg0 [0 ,0 ,0 ] : tensor <256 x128 x2 xi32 , #blocked1 > to tensor <256 x16 x2 xi32 , #blocked1 >
99
- tt.return
100
- }
101
-
102
- // -----
103
-
104
73
// Invalid non-static offset
105
74
#blocked1 = #ttg.blocked <{sizePerThread = [8 , 1 ], threadsPerWarp = [4 , 16 ], warpsPerCTA = [8 , 1 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [0 , 1 ]}>
106
75
tt.func @invalid_non_static_offset (%arg0: tensor <256 x128 xi32 , #blocked1 > {tt.divisibility = 16 : i32 }, %arg1: i32 ) {
@@ -109,3 +78,25 @@ tt.func @invalid_non_static_offset(%arg0: tensor<256x128xi32, #blocked1> {tt.div
109
78
%2 = amdgpu.extract_slice %arg0 [%arg1 , 0 ] : tensor <256 x128 xi32 , #blocked1 > to tensor <256 x16 xi32 , #blocked1 >
110
79
tt.return
111
80
}
81
+
82
+ // -----
83
+
84
+ // Invalid layout 1: register basis does not match on a CTA tile between source and destination
85
+ #dst_layout = #ttg.linear <{register =[[0 , 1 ], [0 , 2 ], [0 , 8 ], [0 , 16 ], [0 , 64 ], [64 , 0 ]], lane =[[1 , 0 ], [2 , 0 ], [4 , 0 ], [8 , 0 ], [16 , 0 ], [0 , 4 ]], warp =[[0 , 32 ], [32 , 0 ]], block =[]}>
86
+ #src_layout = #ttg.linear <{register =[[0 , 0 ], [0 , 1 ], [0 , 2 ], [0 , 8 ], [0 , 16 ], [0 , 64 ], [0 , 128 ], [64 , 0 ], [128 , 0 ]], lane =[[1 , 0 ], [2 , 0 ], [4 , 0 ], [8 , 0 ], [16 , 0 ], [0 , 4 ]], warp =[[0 , 32 ], [32 , 0 ]], block =[]}>
87
+ tt.func @invalid_register_base (%arg0: tensor <256 x256 xi32 , #src_layout > {tt.divisibility = 16 : i32 }) {
88
+ // expected-error @+1 {{Register basis must match on a CTA tile between source and destination}}
89
+ %2 = amdgpu.extract_slice %arg0 [0 , 0 ] : tensor <256 x256 xi32 , #src_layout > to tensor <128 x128 xi32 , #dst_layout >
90
+ tt.return
91
+ }
92
+
93
+ // -----
94
+
95
+ // Invalid layout 2: lane and warp basis do not match between source and destination layouts
96
+ #dst_layout = #ttg.linear <{register =[[0 , 1 ], [0 , 2 ], [0 , 8 ], [0 , 16 ], [0 , 64 ], [64 , 0 ]], lane =[[1 , 0 ], [2 , 0 ], [4 , 0 ], [8 , 0 ], [16 , 0 ], [0 , 4 ]], warp =[[0 , 32 ], [32 , 0 ]], block =[]}>
97
+ #src_layout = #ttg.linear <{register =[[0 , 1 ], [0 , 2 ], [0 , 8 ], [0 , 16 ], [0 , 64 ], [0 , 128 ], [64 , 0 ], [128 , 0 ]], lane =[[1 , 0 ], [2 , 0 ], [4 , 0 ], [8 , 0 ], [16 , 0 ], [0 , 4 ], [0 , 0 ]], warp =[[0 , 32 ], [32 , 0 ]], block =[]}>
98
+ tt.func @invalid_lane_warp_basis (%arg0: tensor <256 x256 xi32 , #src_layout > {tt.divisibility = 16 : i32 }) {
99
+ // expected-error @+1 {{Lane and warp dim basis must match between source and destination layout}}
100
+ %2 = amdgpu.extract_slice %arg0 [0 , 0 ] : tensor <256 x256 xi32 , #src_layout > to tensor <128 x128 xi32 , #dst_layout >
101
+ tt.return
102
+ }
0 commit comments