@@ -40,6 +40,25 @@ tt.func @test_canonicalize_convert_expensive_view(%arg0: tensor<256x16xf32, #blo
4040
4141// -----
4242
43+ // Test that the convert is not combined with the view when the resulting operation
44+ // would be an expensive view, i.e. one that requires moving data across threads.
45+ // CHECK-LABEL: @test_canonicalize_convert_expensive_view2
46+ // CHECK-SAME: (%[[ARG:.+]]: tensor<2xf32
47+ // CHECK: %[[C:.+]] = ttg.convert_layout %[[ARG]]
48+ // CHECK: %[[V:.+]] = tt.reshape %[[C]] allow_reorder
49+ // CHECK: tt.return %[[V]]
// Layout aliases for this test case: #blocked is a 2-D blocked layout and
// #blocked1 is a 1-D blocked layout. The function argument uses a slice of
// #blocked along dim 1; the result uses #blocked1.
// NOTE(review): the spaces inside tokens below (e.g. `tensor <2 xf32`,
// `" ttg.num-ctas"`) look like extraction damage rather than the real source
// text - confirm against the original test file.
50+ #blocked = #ttg.blocked <{sizePerThread = [1 , 1 ], threadsPerWarp = [8 , 4 ], warpsPerCTA = [4 , 1 ], order = [1 , 0 ]}>
51+ #blocked1 = #ttg.blocked <{sizePerThread = [1 ], threadsPerWarp = [32 ], warpsPerCTA = [4 ], order = [0 ]}>
52+ module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , ttg.target = " cuda:80" } {
// Input IR: a ttg.convert_layout from the sliced layout to #blocked1, followed
// by a tt.reshape with allow_reorder whose source and result types are the same.
53+ tt.func @test_canonicalize_convert_expensive_view2 (%arg0: tensor <2 xf32 , #ttg.slice <{dim = 1 , parent = #blocked }>>) -> tensor <2 xf32 , #blocked1 > {
54+ %c = ttg.convert_layout %arg0 : tensor <2 xf32 , #ttg.slice <{dim = 1 , parent = #blocked }>> -> tensor <2 xf32 , #blocked1 >
55+ %r = tt.reshape %c allow_reorder : tensor <2 xf32 , #blocked1 > -> tensor <2 xf32 , #blocked1 >
56+ tt.return %r : tensor <2 xf32 , #blocked1 >
57+ }
58+ }
59+
60+ // -----
61+
4362// test that the convert does get combined with the view when the resulting operation
4463// is an efficient view.
4564// CHECK-LABEL: @test_canonicalize_convert_view
0 commit comments