@@ -326,8 +326,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
326326 %59 = tt.fp_to_fp %32 : tensor <64 x64 xf8 E5 M2 , #triton_gpu.dot_op <{opIdx = 0 , parent = #mma , kWidth = 2 }>> -> tensor <64 x64 xf16 , #triton_gpu.dot_op <{opIdx = 0 , parent = #mma , kWidth = 2 }>>
327327 %60 = tt.fp_to_fp %58 : tensor <64 x32 xf8 E5 M2 , #triton_gpu.dot_op <{opIdx = 1 , parent = #mma , kWidth = 2 }>> -> tensor <64 x32 xf16 , #triton_gpu.dot_op <{opIdx = 1 , parent = #mma , kWidth = 2 }>>
328328 %61 = tt.dot %59 , %60 , %cst_2 , inputPrecision = tf32 : tensor <64 x64 xf16 , #triton_gpu.dot_op <{opIdx = 0 , parent = #mma , kWidth = 2 }>> * tensor <64 x32 xf16 , #triton_gpu.dot_op <{opIdx = 1 , parent = #mma , kWidth = 2 }>> -> tensor <64 x32 xf32 , #mma >
329- // CHECK-DAG: [[ADVANCE1:%.*]] = tt.advance [[ARG1]], {{.*}} : <tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>>
330- // CHECK-DAG: [[ADVANCE2:%.*]] = tt.advance [[ARG2]], {{.*}} : <tensor<32x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>
329+ // CHECK-DAG: [[ADVANCE1:%.*]] = tt.advance [[ARG1]], {{.*}} : <tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>>
330+ // CHECK-DAG: [[ADVANCE2:%.*]] = tt.advance [[ARG2]], {{.*}} : <tensor<32x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>
331331 // CHECK-NEXT: scf.yield [[ADVANCE1]], [[ADVANCE2]] : !tt.ptr<tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>>, !tt.ptr<tensor<32x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>
332332 %84 = tt.advance %arg26 , [%c32_i32 , %c0_i32 ] : <tensor <32 x64 xf8 E5 M2 , #triton_gpu.dot_op <{opIdx = 1 , parent = #mma , kWidth = 2 }>>>
333333 %85 = tt.advance %arg25 , [%c0_i32 , %c32_i32 ] : <tensor <64 x32 xf8 E5 M2 , #triton_gpu.dot_op <{opIdx = 1 , parent = #mma , kWidth = 2 }>>>
0 commit comments