@@ -83,20 +83,23 @@ tt.func @invalid_non_static_offset(%arg0: tensor<256x128xi32, #blocked1> {tt.div
8383
8484// Invalid layout 1
8585#dst_layout = #ttg.linear <{register =[[0 , 1 ], [0 , 2 ], [0 , 8 ], [0 , 16 ], [0 , 64 ], [64 , 0 ]], lane =[[1 , 0 ], [2 , 0 ], [4 , 0 ], [8 , 0 ], [16 , 0 ], [0 , 4 ]], warp =[[0 , 32 ], [32 , 0 ]], block =[]}>
86- #src_layout = #ttg.linear <{register =[[0 , 0 ], [ 0 , 1 ], [0 , 2 ], [0 , 8 ], [0 , 16 ], [0 , 64 ], [0 , 128 ], [64 , 0 ], [128 , 0 ]], lane =[[1 , 0 ], [2 , 0 ], [4 , 0 ], [8 , 0 ], [16 , 0 ], [0 , 4 ]], warp =[[0 , 32 ], [32 , 0 ]], block =[]}>
87- tt.func @invalid_register_base (%arg0: tensor <256 x256 xi32 , #src_layout > {tt.divisibility = 16 : i32 }) {
88- // expected-error @+1 {{Register basis must match on a CTA tile between source and destination}}
86+ #src_layout = #ttg.linear <{register =[[0 , 1 ], [0 , 2 ], [0 , 8 ], [0 , 16 ], [0 , 64 ], [0 , 128 ], [64 , 0 ], [128 , 0 ]], lane =[[1 , 0 ], [2 , 0 ], [4 , 0 ], [8 , 0 ], [16 , 0 ], [0 , 4 ], [ 0 , 0 ]], warp =[[0 , 32 ], [32 , 0 ]], block =[]}>
87+ tt.func @invalid_lane_warp_basis (%arg0: tensor <256 x256 xi32 , #src_layout > {tt.divisibility = 16 : i32 }) {
88+ // expected-error @+1 {{Lane and warp dim basis must match between source and destination layout }}
8989 %2 = amdgpu.extract_slice %arg0 [0 , 0 ] : tensor <256 x256 xi32 , #src_layout > to tensor <128 x128 xi32 , #dst_layout >
9090 tt.return
9191}
9292
9393// -----
9494
9595// Invalid layout 2
96- #dst_layout = #ttg.linear <{register =[[0 , 1 ], [0 , 2 ], [0 , 8 ], [0 , 16 ], [0 , 64 ], [64 , 0 ]], lane =[[1 , 0 ], [2 , 0 ], [4 , 0 ], [8 , 0 ], [16 , 0 ], [0 , 4 ]], warp =[[0 , 32 ], [32 , 0 ]], block =[]}>
97- #src_layout = #ttg.linear <{register =[[0 , 1 ], [0 , 2 ], [0 , 8 ], [0 , 16 ], [0 , 64 ], [0 , 128 ], [64 , 0 ], [128 , 0 ]], lane =[[1 , 0 ], [2 , 0 ], [4 , 0 ], [8 , 0 ], [16 , 0 ], [0 , 4 ], [0 , 0 ]], warp =[[0 , 32 ], [32 , 0 ]], block =[]}>
98- tt.func @invalid_lane_warp_basis (%arg0: tensor <256 x256 xi32 , #src_layout > {tt.divisibility = 16 : i32 }) {
99- // expected-error @+1 {{Lane and warp dim basis must match between source and destination layout}}
100- %2 = amdgpu.extract_slice %arg0 [0 , 0 ] : tensor <256 x256 xi32 , #src_layout > to tensor <128 x128 xi32 , #dst_layout >
101- tt.return
96+ // Case where the src and dst layouts have the same CTA tile shape but a different number of registers
97+ #src_layout = #ttg.linear <{register =[[1 , 0 ], [2 , 0 ]], lane =[[4 , 0 ], [8 , 0 ], [16 , 0 ], [0 , 1 ], [0 , 2 ], [0 , 4 ]], warp =[[0 , 0 ], [0 , 8 ]], block =[]}>
98+ #dst_layout = #ttg.linear <{register =[[1 , 0 ]], lane =[[4 , 0 ], [8 , 0 ], [16 , 0 ], [0 , 1 ], [0 , 2 ], [0 , 4 ]], warp =[[2 , 0 ], [0 , 8 ]], block =[]}>
99+ module attributes {" ttg.compute-capability" = 0 : i32 , " ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp" = 64 : i32 } {
100+ tt.func @invalid_concat (%arg0: tensor <64 x32 xi32 , #src_layout >) {
101+ // expected-error @+1 {{Register basis must match on a CTA tile between source and destination.}}
102+ %1 = amdgpu.extract_slice %arg0 [0 , 0 ] : tensor <64 x32 xi32 , #src_layout > to tensor <32 x16 xi32 , #dst_layout >
103+ tt.return
104+ }
102105}
0 commit comments