@@ -23,13 +23,13 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
23
23
// CHECK: ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
24
24
// CHECK-NEXT: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
25
25
// CHECK: scf.for %[[IV:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args({{.*}}) -> (tensor<128x256xf32, #mma>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>)
26
- // WORKGROUP_SCOPE-NEXT: spirv.INTEL.ControlBarrierArrive <Workgroup> <Workgroup> <None>
27
- // SUBGROUP_SCOPE-NEXT: spirv.INTEL.ControlBarrierArrive <Subgroup> <Subgroup> <None>
26
+ // WORKGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = WorkGroup, memory_scope = WorkGroup}
27
+ // SUBGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = SubGroup, memory_scope = SubGroup}
28
28
// CHECK: ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
29
29
// CHECK: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
30
30
// CHECK: tt.dot {{.*}} : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<128x256xf32, #[[$DPAS]]>
31
- // WORKGROUP_SCOPE: spirv.INTEL.ControlBarrierWait <Workgroup> <Workgroup> <None>
32
- // SUBGROUP_SCOPE: spirv.INTEL.ControlBarrierWait <Subgroup> <Subgroup> <None>
31
+ // WORKGROUP_SCOPE: triton_gen.split_barrier_wait {execution_scope = WorkGroup, memory_scope = WorkGroup}
32
+ // SUBGROUP_SCOPE: triton_gen.split_barrier_wait {execution_scope = SubGroup, memory_scope = SubGroup}
33
33
// CHECK-NEXT: scf.yield
34
34
%23:3 = scf.for %arg2 = %c0_i32 to %c64_i32 step %c64_i32 iter_args (%arg3 = %cst , %arg4 = %18 , %arg5 = %22 ) -> (tensor <128 x256 xf32 , #dpas >, !tt.ptr <tensor <128 x64 xf16 , #dot0 >>, !tt.ptr <tensor <64 x256 xf16 , #dot1 >>) : i32 {
35
35
%55:3 = scf.for %arg9 = %c0_i32 to %c64_i32 step %c64_i32 iter_args (%arg10 = %cst , %arg11 = %18 , %arg12 = %22 ) -> (tensor <128 x256 xf32 , #dpas >, !tt.ptr <tensor <128 x64 xf16 , #dot0 >>, !tt.ptr <tensor <64 x256 xf16 , #dot1 >>) : i32 {
@@ -70,13 +70,13 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
70
70
// CHECK: ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
71
71
// CHECK-NEXT: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
72
72
// CHECK: scf.for %[[IV:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args({{.*}}) -> (tensor<128x256xf32, #mma>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>)
73
- // WORKGROUP_SCOPE-NEXT: spirv.INTEL.ControlBarrierArrive <Workgroup> <Workgroup> <None>
74
- // SUBGROUP_SCOPE-NEXT: spirv.INTEL.ControlBarrierArrive <Subgroup> <Subgroup> <None>
73
+ // WORKGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = WorkGroup, memory_scope = WorkGroup}
74
+ // SUBGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = SubGroup, memory_scope = SubGroup}
75
75
// CHECK: ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
76
76
// CHECK: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
77
77
// CHECK: tt.dot {{.*}} : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<128x256xf32, #[[$DPAS]]>
78
- // WORKGROUP_SCOPE: spirv.INTEL.ControlBarrierWait <Workgroup> <Workgroup> <None>
79
- // SUBGROUP_SCOPE: spirv.INTEL.ControlBarrierWait <Subgroup> <Subgroup> <None>
78
+ // WORKGROUP_SCOPE: triton_gen.split_barrier_wait {execution_scope = WorkGroup, memory_scope = WorkGroup}
79
+ // SUBGROUP_SCOPE: triton_gen.split_barrier_wait {execution_scope = SubGroup, memory_scope = SubGroup}
80
80
// CHECK-NEXT: scf.yield
81
81
%23:3 = scf.for %arg9 = %c0_i32 to %c64_i32 step %c64_i32 iter_args (%arg10 = %cst , %arg11 = %18 , %arg12 = %22 ) -> (tensor <128 x256 xf32 , #dpas >, !tt.ptr <tensor <128 x64 xf16 , #dot0 >>, !tt.ptr <tensor <64 x256 xf16 , #dot1 >>) : i32 {
82
82
%56 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >, ttig.block_io = " row_major" } : !tt.ptr <tensor <128 x64 xf16 , #dot0 >>
0 commit comments