22
33
44#shared = #ttg.swizzled_shared <{vec = 8 , perPhase = 2 , maxPhase = 8 , order = [1 , 0 ]}>
5+ #padded = #ttg.padded_shared <[32 :+4 ] {order = [1 , 0 ], shape = [256 , 128 ]}>
56#smem = #ttg.shared_memory
67
78module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 8 : i32 , ttg.target = " hip:gfx942" , " ttg.threads-per-warp" = 64 : i32 } {
@@ -10,30 +11,20 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
1011 %c0_i32 = arith.constant 0 : i32
1112 %0 = ttg.local_alloc : () -> !ttg.memdesc <1 x256 x128 xf16 , #shared , #smem , mutable >
1213 %1 = ttg.memdesc_index %0 [%c0_i32 ] : !ttg.memdesc <1 x256 x128 xf16 , #shared , #smem , mutable > -> !ttg.memdesc <256 x128 xf16 , #shared , #smem , mutable >
13- %c0_i32_0 = arith.constant 0 : i32
14- %c0_i32_1 = arith.constant 0 : i32
1514 %2 = ttg.memdesc_subslice %1 [0 , 0 ] : !ttg.memdesc <256 x128 xf16 , #shared , #smem , mutable > -> !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 256 x128 >
16- %c0_i32_2 = arith.constant 0 : i32
17- %c32_i32 = arith.constant 32 : i32
1815 %3 = ttg.memdesc_subslice %1 [0 , 32 ] : !ttg.memdesc <256 x128 xf16 , #shared , #smem , mutable > -> !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 256 x128 >
19- %c0_i32_3 = arith.constant 0 : i32
20- %c64_i32 = arith.constant 64 : i32
2116 %4 = ttg.memdesc_subslice %1 [0 , 64 ] : !ttg.memdesc <256 x128 xf16 , #shared , #smem , mutable > -> !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 256 x128 >
22- %c0_i32_4 = arith.constant 0 : i32
23- %c96_i32 = arith.constant 96 : i32
2417 %5 = ttg.memdesc_subslice %1 [0 , 96 ] : !ttg.memdesc <256 x128 xf16 , #shared , #smem , mutable > -> !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 256 x128 >
25- %c128_i32 = arith.constant 128 : i32
26- %c0_i32_5 = arith.constant 0 : i32
2718 %6 = ttg.memdesc_subslice %1 [128 , 0 ] : !ttg.memdesc <256 x128 xf16 , #shared , #smem , mutable > -> !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 256 x128 >
28- %c128_i32_6 = arith.constant 128 : i32
29- %c32_i32_7 = arith.constant 32 : i32
3019 %7 = ttg.memdesc_subslice %1 [128 , 32 ] : !ttg.memdesc <256 x128 xf16 , #shared , #smem , mutable > -> !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 256 x128 >
31- %c128_i32_8 = arith.constant 128 : i32
32- %c64_i32_9 = arith.constant 64 : i32
3320 %8 = ttg.memdesc_subslice %1 [128 , 64 ] : !ttg.memdesc <256 x128 xf16 , #shared , #smem , mutable > -> !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 256 x128 >
34- %c128_i32_10 = arith.constant 128 : i32
35- %c96_i32_11 = arith.constant 96 : i32
3621 %9 = ttg.memdesc_subslice %1 [128 , 96 ] : !ttg.memdesc <256 x128 xf16 , #shared , #smem , mutable > -> !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 256 x128 >
22+
23+ %padded = ttg.local_alloc : () -> !ttg.memdesc <1 x256 x128 xf16 , #padded , #smem , mutable >
24+ %padded_indexed_explicit_alloc_shape = ttg.memdesc_index %padded [%c0_i32 ] : !ttg.memdesc <1 x256 x128 xf16 , #padded , #smem , mutable > -> !ttg.memdesc <256 x128 xf16 , #padded , #smem , mutable , 1 x256 x128 >
25+ %10 = ttg.memdesc_subslice %padded_indexed_explicit_alloc_shape [128 , 96 ] : !ttg.memdesc <256 x128 xf16 , #padded , #smem , mutable , 1 x256 x128 > -> !ttg.memdesc <128 x32 xf16 , #padded , #smem , mutable , 1 x256 x128 >
26+ %padded_indexed_implicit_alloc_shape = ttg.memdesc_index %padded [%c0_i32 ] : !ttg.memdesc <1 x256 x128 xf16 , #padded , #smem , mutable > -> !ttg.memdesc <256 x128 xf16 , #padded , #smem , mutable >
27+ %11 = ttg.memdesc_subslice %padded_indexed_implicit_alloc_shape [128 , 96 ] : !ttg.memdesc <256 x128 xf16 , #padded , #smem , mutable > -> !ttg.memdesc <128 x32 xf16 , #padded , #smem , mutable , 256 x128 >
3728 tt.return
3829 }
3930}
0 commit comments