@@ -596,28 +596,6 @@ module attributes {"ttg.target" = "cuda:80", "ttg.num-ctas" = 1 : i32, "ttg.num-
   }
 }
 
-// -----
-
-
-// CHECK-DAG: #[[$BLOCK0:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}>
-// CHECK-DAG: #[[$BLOCK1:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [0, 1]}>
-// CHECK-LABEL: optimize_view_layout_same_shape
-// CHECK: %[[R:.+]] = tt.reshape {{.*}} allow_reorder efficient_layout : tensor<64x16xf32, #[[$BLOCK0]]> -> tensor<64x16xf32, #[[$BLOCK1]]>
-// CHECK: %[[C:.+]] = ttg.convert_layout %[[R]] : tensor<64x16xf32, #[[$BLOCK1]]> -> tensor<64x16xf32, #[[$BLOCK0]]>
-// CHECK: "tt.reduce"(%[[C]])
-#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}>
-module attributes {"ttg.target" = "cuda:80", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, "ttg.threads-per-warp" = 32 : i32} {
-  tt.func public @optimize_view_layout_same_shape(%arg0: tensor<64x16xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> {
-    %0 = tt.reshape %arg0 allow_reorder : tensor<64x16xf32, #blocked> -> tensor<64x16xf32, #blocked>
-    %1 = "tt.reduce"(%0) <{axis = 1 : i32}> ({
-    ^bb0(%arg1: f32, %arg2: f32):
-      %2 = arith.maximumf %arg1, %arg2 : f32
-      tt.reduce.return %2 : f32
-    }) : (tensor<64x16xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
-    tt.return %1 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-
 // -----
 #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 1], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}>