@@ -143,7 +143,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
143143 ttg.local_store %14 , %17 : tensor <128 x32 xf16 , #blocked1 > -> !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 1 x128 x32 >
144144 %18 = ttg.memdesc_subview %11 [%c0_i32 , %c0_i32 , %c0_i32 ] : !ttg.memdesc <1 x32 x128 xf16 , #shared1 , #smem , mutable > -> !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable , 1 x32 x128 >
145145 ttg.local_store %16 , %18 : tensor <32 x128 xf16 , #blocked > -> !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable , 1 x32 x128 >
146- %19:6 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args (%arg6 = %4 , %arg7 = %9 , %arg8 = %cst_2 , %arg9 = %c0_i32 , %arg10 = %17 , %arg11 = %18 ) -> (tensor <128 x32 x!tt.ptr <f16 >, #blocked1 >, tensor <32 x128 x!tt.ptr <f16 >, #blocked >, tensor <128 x128 xf32 , #mma >, i32 , !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable >, !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable >) {
146+ %19:6 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args (%arg6 = %4 , %arg7 = %9 , %arg8 = %cst_2 , %arg9 = %c0_i32 , %arg10 = %17 , %arg11 = %18 ) -> (tensor <128 x32 x!tt.ptr <f16 >, #blocked1 >, tensor <32 x128 x!tt.ptr <f16 >, #blocked >, tensor <128 x128 xf32 , #mma >, i32 , !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 1 x 128 x 32 >, !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable , 1 x 32 x 128 >) {
147147 %20 = arith.subi %arg1 , %arg2 : index
148148 %21 = arith.cmpi slt , %arg5 , %20 : index
149149 %22 = ttg.local_load %arg10 : !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 1 x128 x32 > -> tensor <128 x32 xf16 , #ttg.dot_op <{opIdx = 0 , parent = #mma , kWidth = 2 }>>
@@ -163,7 +163,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
163163 ttg.local_store %29 , %35 : tensor <128 x32 xf16 , #blocked1 > -> !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 1 x128 x32 >
164164 %36 = ttg.memdesc_subview %11 [%34 , %c0_i32 , %c0_i32 ] : !ttg.memdesc <1 x32 x128 xf16 , #shared1 , #smem , mutable > -> !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable , 1 x32 x128 >
165165 ttg.local_store %31 , %36 : tensor <32 x128 xf16 , #blocked > -> !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable , 1 x32 x128 >
166- scf.yield %26 , %27 , %25 , %34 , %35 , %36 : tensor <128 x32 x!tt.ptr <f16 >, #blocked1 >, tensor <32 x128 x!tt.ptr <f16 >, #blocked >, tensor <128 x128 xf32 , #mma >, i32 , !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable >, !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable >
166+ scf.yield %26 , %27 , %25 , %34 , %35 , %36 : tensor <128 x32 x!tt.ptr <f16 >, #blocked1 >, tensor <32 x128 x!tt.ptr <f16 >, #blocked >, tensor <128 x128 xf32 , #mma >, i32 , !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 1 x 128 x 32 >, !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable , 1 x 32 x 128 >
167167 }
168168 ttg.local_dealloc %10 : !ttg.memdesc <1 x128 x32 xf16 , #shared , #smem , mutable >
169169 ttg.local_dealloc %11 : !ttg.memdesc <1 x32 x128 xf16 , #shared1 , #smem , mutable >
@@ -242,7 +242,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
242242 ttg.local_store %14 , %25 : tensor <128 x32 xf16 , #blocked1 > -> !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 1 x128 x32 >
243243 %26 = ttg.memdesc_subview %11 [%c0_i32 , %c0_i32 , %c0_i32 ] : !ttg.memdesc <2 x32 x128 xf16 , #shared1 , #smem , mutable > -> !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable , 1 x32 x128 >
244244 ttg.local_store %16 , %26 : tensor <32 x128 xf16 , #blocked > -> !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable , 1 x32 x128 >
245- %27:8 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args (%arg6 = %19 , %arg7 = %20 , %arg8 = %cst_2 , %arg9 = %c0_i32 , %arg10 = %25 , %arg11 = %26 , %arg12 = %22 , %arg13 = %24 ) -> (tensor <128 x32 x!tt.ptr <f16 >, #blocked1 >, tensor <32 x128 x!tt.ptr <f16 >, #blocked >, tensor <128 x128 xf32 , #mma >, i32 , !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable >, !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable >, tensor <128 x32 xf16 , #blocked1 >, tensor <32 x128 xf16 , #blocked >) {
245+ %27:8 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args (%arg6 = %19 , %arg7 = %20 , %arg8 = %cst_2 , %arg9 = %c0_i32 , %arg10 = %25 , %arg11 = %26 , %arg12 = %22 , %arg13 = %24 ) -> (tensor <128 x32 x!tt.ptr <f16 >, #blocked1 >, tensor <32 x128 x!tt.ptr <f16 >, #blocked >, tensor <128 x128 xf32 , #mma >, i32 , !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 1 x 128 x 32 >, !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable , 1 x 32 x 128 >, tensor <128 x32 xf16 , #blocked1 >, tensor <32 x128 xf16 , #blocked >) {
246246 %28 = arith.muli %arg2 , %c2 : index
247247 %29 = arith.subi %arg1 , %28 : index
248248 %30 = arith.cmpi slt , %arg5 , %29 : index
@@ -263,7 +263,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
263263 ttg.local_store %arg12 , %44 : tensor <128 x32 xf16 , #blocked1 > -> !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 1 x128 x32 >
264264 %45 = ttg.memdesc_subview %11 [%43 , %c0_i32 , %c0_i32 ] : !ttg.memdesc <2 x32 x128 xf16 , #shared1 , #smem , mutable > -> !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable , 1 x32 x128 >
265265 ttg.local_store %arg13 , %45 : tensor <32 x128 xf16 , #blocked > -> !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable , 1 x32 x128 >
266- scf.yield %35 , %36 , %34 , %43 , %44 , %45 , %38 , %40 : tensor <128 x32 x!tt.ptr <f16 >, #blocked1 >, tensor <32 x128 x!tt.ptr <f16 >, #blocked >, tensor <128 x128 xf32 , #mma >, i32 , !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable >, !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable >, tensor <128 x32 xf16 , #blocked1 >, tensor <32 x128 xf16 , #blocked >
266+ scf.yield %35 , %36 , %34 , %43 , %44 , %45 , %38 , %40 : tensor <128 x32 x!tt.ptr <f16 >, #blocked1 >, tensor <32 x128 x!tt.ptr <f16 >, #blocked >, tensor <128 x128 xf32 , #mma >, i32 , !ttg.memdesc <128 x32 xf16 , #shared , #smem , mutable , 1 x 128 x 32 >, !ttg.memdesc <32 x128 xf16 , #shared1 , #smem , mutable , 1 x 32 x 128 >, tensor <128 x32 xf16 , #blocked1 >, tensor <32 x128 xf16 , #blocked >
267267 }
268268 ttg.local_dealloc %10 : !ttg.memdesc <2 x128 x32 xf16 , #shared , #smem , mutable >
269269 ttg.local_dealloc %11 : !ttg.memdesc <2 x32 x128 xf16 , #shared1 , #smem , mutable >
@@ -335,7 +335,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
335335 ttg.local_store %8 , %17 : tensor <16 x16 xf16 , #blocked1 > -> !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable , 1 x16 x16 >
336336 %18 = ttg.memdesc_subview %1 [%c0_i32 , %c0_i32 , %c0_i32 ] : !ttg.memdesc <1 x16 x16 xf16 , #shared2 , #smem , mutable > -> !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable , 1 x16 x16 >
337337 ttg.local_store %14 , %18 : tensor <16 x16 xf16 , #blocked > -> !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable , 1 x16 x16 >
338- %19:7 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args (%arg7 = %cst , %arg8 = %arg2 , %arg9 = %6 , %arg10 = %c0_i32 , %arg11 = %17 , %arg12 = %18 , %arg13 = %16 ) -> (tensor <16 x16 xf32 , #mma >, tensor <16 x16 x!tt.ptr <f16 >, #blocked1 >, tensor <16 x!tt.ptr <i64 >, #ttg.slice <{dim = 1 , parent = #blocked }>>, i32 , !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable >, !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable >, tensor <16 xi64 , #ttg.slice <{dim = 1 , parent = #blocked }>>) {
338+ %19:7 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args (%arg7 = %cst , %arg8 = %arg2 , %arg9 = %6 , %arg10 = %c0_i32 , %arg11 = %17 , %arg12 = %18 , %arg13 = %16 ) -> (tensor <16 x16 xf32 , #mma >, tensor <16 x16 x!tt.ptr <f16 >, #blocked1 >, tensor <16 x!tt.ptr <i64 >, #ttg.slice <{dim = 1 , parent = #blocked }>>, i32 , !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable , 1 x 16 x 16 >, !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable , 1 x 16 x 16 >, tensor <16 xi64 , #ttg.slice <{dim = 1 , parent = #blocked }>>) {
339339 %20 = arith.subi %arg1 , %c2 : index
340340 %21 = arith.cmpi slt , %arg6 , %20 : index
341341 %22 = arith.subi %arg1 , %c1 : index
@@ -362,7 +362,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
362362 ttg.local_store %30 , %42 : tensor <16 x16 xf16 , #blocked1 > -> !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable , 1 x16 x16 >
363363 %43 = ttg.memdesc_subview %1 [%41 , %c0_i32 , %c0_i32 ] : !ttg.memdesc <1 x16 x16 xf16 , #shared2 , #smem , mutable > -> !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable , 1 x16 x16 >
364364 ttg.local_store %36 , %43 : tensor <16 x16 xf16 , #blocked > -> !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable , 1 x16 x16 >
365- scf.yield %26 , %27 , %28 , %41 , %42 , %43 , %38 : tensor <16 x16 xf32 , #mma >, tensor <16 x16 x!tt.ptr <f16 >, #blocked1 >, tensor <16 x!tt.ptr <i64 >, #ttg.slice <{dim = 1 , parent = #blocked }>>, i32 , !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable >, !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable >, tensor <16 xi64 , #ttg.slice <{dim = 1 , parent = #blocked }>>
365+ scf.yield %26 , %27 , %28 , %41 , %42 , %43 , %38 : tensor <16 x16 xf32 , #mma >, tensor <16 x16 x!tt.ptr <f16 >, #blocked1 >, tensor <16 x!tt.ptr <i64 >, #ttg.slice <{dim = 1 , parent = #blocked }>>, i32 , !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable , 1 x 16 x 16 >, !ttg.memdesc <16 x16 xf16 , #shared2 , #smem , mutable , 1 x 16 x 16 >, tensor <16 xi64 , #ttg.slice <{dim = 1 , parent = #blocked }>>
366366 }
367367 ttg.local_dealloc %0 : !ttg.memdesc <1 x16 x16 xf16 , #shared2 , #smem , mutable >
368368 ttg.local_dealloc %1 : !ttg.memdesc <1 x16 x16 xf16 , #shared2 , #smem , mutable >
0 commit comments