@@ -2776,7 +2776,7 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32}
     %cst_1 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
     %cst_2 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>>
     %cst_3 = arith.constant dense<1.230000e+02> : tensor<32x16xf32, #mma1>
-    %0 = tt.dot %cst_0, %cst_1, %cst : tensor<32x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x32xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<32x32xf32, #mma>
+    %0 = tt.dot %cst_0, %cst_1, %cst, inputPrecision = tf32 : tensor<32x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x32xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<32x32xf32, #mma>
     %1 = ttg.convert_layout %0 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked>
     %2 = "tt.reduce"(%1) ({
     ^bb0(%arg1: f32, %arg2: f32):
@@ -2786,7 +2786,7 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32}
     %4 = tt.expand_dims %2 {axis = 1 : i32} : tensor<32xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xf32, #blocked>
     %5 = tt.broadcast %4 : tensor<32x1xf32, #blocked> -> tensor<32x16xf32, #blocked>
     %6 = ttg.convert_layout %5 : tensor<32x16xf32, #blocked> -> tensor<32x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>>
-    %7 = tt.dot %cst_2, %6, %cst_3 : tensor<32x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>> * tensor<32x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>> -> tensor<32x16xf32, #mma1>
+    %7 = tt.dot %cst_2, %6, %cst_3, inputPrecision = tf32 : tensor<32x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>> * tensor<32x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>> -> tensor<32x16xf32, #mma1>
     %addr = tt.splat %arg0 : !tt.ptr<f32> -> tensor<32x16x!tt.ptr<f32>, #blocked>
     %8 = ttg.convert_layout %7 : tensor<32x16xf32, #mma1> -> tensor<32x16xf32, #blocked>
     tt.store %addr, %8 : tensor<32x16x!tt.ptr<f32>, #blocked>
@@ -2992,7 +2992,7 @@ tt.func @hoist_multiple_conditional(
   }
   %2 = arith.addf %0, %1 : tensor<128x32xf32, #blocked>
   %3 = ttg.convert_layout %2 : tensor<128x32xf32, #blocked> -> tensor<128x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
-  %4 = tt.dot %3, %arg4, %arg5 : tensor<128x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma>
+  %4 = tt.dot %3, %arg4, %arg5, inputPrecision = tf32 : tensor<128x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma>
   tt.return %4 : tensor<128x128xf32, #mma>
 }

@@ -3021,7 +3021,7 @@ tt.func @hoist_across_loop(
     }
     // CHECK-NOT: ttg.convert_layout
     %2 = ttg.convert_layout %1 : tensor<128x32xf32, #blocked> -> tensor<128x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
-    %3 = tt.dot %2, %arg2, %acc : tensor<128x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma>
+    %3 = tt.dot %2, %arg2, %acc, inputPrecision = tf32 : tensor<128x32xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma>
     scf.yield %1, %3 : tensor<128x32xf32, #blocked>, tensor<128x128xf32, #mma>
   }
   tt.return %0#1 : tensor<128x128xf32, #mma>
@@ -3335,7 +3335,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.target" = "cuda:80"} {
 // CHECK-DAG: %[[AEXT:.*]] = arith.extf %[[ACVT]] : tensor<16x16xf16, #ttg.dot_op<{opIdx = 0, parent = #[[MMA]], kWidth = 2}>> to tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #[[MMA]], kWidth = 2}>>
 // CHECK-DAG: %[[BCVT:.*]] = ttg.convert_layout %[[BLOAD]] : tensor<16x16xf16, #[[BB]]> -> tensor<16x16xf16, #ttg.dot_op<{opIdx = 1, parent = #[[MMA]], kWidth = 2}>>
 // CHECK-DAG: %[[BEXT:.*]] = arith.extf %[[BCVT]] : tensor<16x16xf16, #ttg.dot_op<{opIdx = 1, parent = #[[MMA]], kWidth = 2}>> to tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #[[MMA]], kWidth = 2}>>
-// CHECK-DAG: tt.dot %[[AEXT]], %[[BEXT]], %{{.*}} : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #[[MMA]], kWidth = 2}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #[[MMA]], kWidth = 2}>> -> tensor<16x16xf32, #mma>
+// CHECK-DAG: tt.dot %[[AEXT]], %[[BEXT]], %{{.*}}, inputPrecision = tf32 : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #[[MMA]], kWidth = 2}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #[[MMA]], kWidth = 2}>> -> tensor<16x16xf32, #mma>
 tt.func @push_convert_both_operands(
   %pa: tensor<16x16x!tt.ptr<f16>, #blockedA> {tt.divisibility = 16 : i32, tt.contiguity = 2 : i32},
   %pb: tensor<16x16x!tt.ptr<f16>, #blockedB> {tt.divisibility = 16 : i32, tt.contiguity = 2 : i32},
@@ -3346,7 +3346,7 @@ tt.func @push_convert_both_operands(
   %be = arith.extf %b : tensor<16x16xf16, #blockedB> to tensor<16x16xf32, #blockedB>
   %al = ttg.convert_layout %ae : tensor<16x16xf32, #blockedA> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
   %bl = ttg.convert_layout %be : tensor<16x16xf32, #blockedB> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
-  %r = tt.dot %al, %bl, %c : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma>
+  %r = tt.dot %al, %bl, %c, inputPrecision = tf32 : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma>
   tt.return %r : tensor<16x16xf32, #mma>
 }

@@ -3372,7 +3372,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.target" = "cuda:80"} {
 // CHECK-DAG: %[[BCVT:.*]] = ttg.convert_layout %[[BLOAD]] : tensor<16x16xf16, #[[BB]]> -> tensor<16x16xf16, #ttg.dot_op<{opIdx = 1, parent = #[[MMA]], kWidth = 2}>>
 // CHECK-DAG: %[[BEXT:.*]] = arith.extf %[[BCVT]] : tensor<16x16xf16, #ttg.dot_op<{opIdx = 1, parent = #[[MMA]], kWidth = 2}>> to tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #[[MMA]], kWidth = 2}>>
 // CHECK-DAG: %[[ADD:.+]] = arith.addf %[[BEXT]], %[[CST]] : tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #[[MMA]], kWidth = 2}>>
-// CHECK-DAG: tt.dot %[[AEXT]], %[[ADD]], %{{.*}} : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #[[MMA]], kWidth = 2}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #[[MMA]], kWidth = 2}>> -> tensor<16x16xf32, #mma>
+// CHECK-DAG: tt.dot %[[AEXT]], %[[ADD]], %{{.*}}, inputPrecision = tf32 : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #[[MMA]], kWidth = 2}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #[[MMA]], kWidth = 2}>> -> tensor<16x16xf32, #mma>
 tt.func @update_kwidth_slice(
   %pa: tensor<16x16x!tt.ptr<f16>, #blockedA> {tt.divisibility = 16 : i32, tt.contiguity = 2 : i32},
   %pb: tensor<16x16x!tt.ptr<f16>, #blockedB> {tt.divisibility = 16 : i32, tt.contiguity = 2 : i32},
@@ -3385,7 +3385,7 @@ tt.func @update_kwidth_slice(
   %add = arith.addf %be, %cst : tensor<16x16xf32, #blockedB>
   %al = ttg.convert_layout %ae : tensor<16x16xf32, #blockedA> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
   %bl = ttg.convert_layout %add : tensor<16x16xf32, #blockedB> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
-  %r = tt.dot %al, %bl, %c : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma>
+  %r = tt.dot %al, %bl, %c, inputPrecision = tf32 : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma>
   tt.return %r : tensor<16x16xf32, #mma>
 }
 }
@@ -3403,7 +3403,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     %cst2 = arith.constant dense<1.000000e+00> : tensor<64x32xf32, #mma>
     %0 = tt.elementwise_inline_asm "cvt.rna.tf32.f32 $0, $1;" {constraints = "=r,r", packed_element = 1 : i32, pure = true} %cst : tensor<128x32xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x32xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>
     %1 = ttg.convert_layout %0 : tensor<128x32xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x32xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
-    %2 = tt.dot %cst1, %1, %cst2 : tensor<64x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x32xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma>
+    %2 = tt.dot %cst1, %1, %cst2, inputPrecision = tf32 : tensor<64x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<128x32xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma>
     tt.return %2 : tensor<64x32xf32, #mma>
   }
 }
@@ -3484,7 +3484,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.target" = "cuda:80"} {
     %a = tt.load %pa2 : tensor<16x16x!tt.ptr<f16>, #blocked>
     %ae = arith.extf %a : tensor<16x16xf16, #blocked> to tensor<16x16xf32, #blocked>
     %ac = ttg.convert_layout %ae : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
-    %r = tt.dot %ac, %b, %c : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma>
+    %r = tt.dot %ac, %b, %c, inputPrecision = tf32 : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma>
     tt.return %r : tensor<16x16xf32, #mma>
   }
 }
@@ -3581,7 +3581,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.target" = "cuda:80"} {
     %aa = arith.addf %ab, %a2 : tensor<16x16xf16, #blocked>
     %ae = arith.extf %aa : tensor<16x16xf16, #blocked> to tensor<16x16xf32, #blocked>
     %ac = ttg.convert_layout %ae : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
-    %r = tt.dot %ac, %b, %c : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma>
+    %r = tt.dot %ac, %b, %c, inputPrecision = tf32 : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma>
     tt.return %r : tensor<16x16xf32, #mma>
   }
 }