@@ -23,13 +23,13 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
2323 // CHECK: ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
2424 // CHECK-NEXT: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
2525 // CHECK: scf.for %[[IV:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args({{.*}}) -> (tensor<128x256xf32, #mma>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>)
26- // WORKGROUP_SCOPE-NEXT: spirv.INTEL.ControlBarrierArrive <Workgroup> <Workgroup> <None>
27- // SUBGROUP_SCOPE-NEXT: spirv.INTEL.ControlBarrierArrive <Subgroup> <Subgroup> <None>
26+ // WORKGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = WorkGroup, memory_scope = WorkGroup}
27+ // SUBGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = SubGroup, memory_scope = SubGroup}
2828 // CHECK: ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
2929 // CHECK: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>
3030 // CHECK: tt.dot {{.*}} : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<128x256xf32, #[[$DPAS]]>
31- // WORKGROUP_SCOPE: spirv.INTEL.ControlBarrierWait <Workgroup> <Workgroup> <None>
32- // SUBGROUP_SCOPE: spirv.INTEL.ControlBarrierWait <Subgroup> <Subgroup> <None>
31+ // WORKGROUP_SCOPE: triton_gen.split_barrier_wait {execution_scope = WorkGroup, memory_scope = WorkGroup}
32+ // SUBGROUP_SCOPE: triton_gen.split_barrier_wait {execution_scope = SubGroup, memory_scope = SubGroup}
3333 // CHECK-NEXT: scf.yield
3434 %23:3 = scf.for %arg2 = %c0_i32 to %c64_i32 step %c64_i32 iter_args (%arg3 = %cst , %arg4 = %18 , %arg5 = %22 ) -> (tensor <128 x256 xf32 , #dpas >, !tt.ptr <tensor <128 x64 xf16 , #dot0 >>, !tt.ptr <tensor <64 x256 xf16 , #dot1 >>) : i32 {
3535 %55:3 = scf.for %arg9 = %c0_i32 to %c64_i32 step %c64_i32 iter_args (%arg10 = %cst , %arg11 = %18 , %arg12 = %22 ) -> (tensor <128 x256 xf32 , #dpas >, !tt.ptr <tensor <128 x64 xf16 , #dot0 >>, !tt.ptr <tensor <64 x256 xf16 , #dot1 >>) : i32 {
@@ -70,13 +70,13 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
7070 // CHECK: ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
7171 // CHECK-NEXT: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
7272 // CHECK: scf.for %[[IV:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args({{.*}}) -> (tensor<128x256xf32, #mma>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>)
73- // WORKGROUP_SCOPE-NEXT: spirv.INTEL.ControlBarrierArrive <Workgroup> <Workgroup> <None>
74- // SUBGROUP_SCOPE-NEXT: spirv.INTEL.ControlBarrierArrive <Subgroup> <Subgroup> <None>
73+ // WORKGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = WorkGroup, memory_scope = WorkGroup}
74+ // SUBGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = SubGroup, memory_scope = SubGroup}
7575 // CHECK: ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
7676 // CHECK: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>
7777 // CHECK: tt.dot {{.*}} : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<128x256xf32, #[[$DPAS]]>
78- // WORKGROUP_SCOPE: spirv.INTEL.ControlBarrierWait <Workgroup> <Workgroup> <None>
79- // SUBGROUP_SCOPE: spirv.INTEL.ControlBarrierWait <Subgroup> <Subgroup> <None>
78+ // WORKGROUP_SCOPE: triton_gen.split_barrier_wait {execution_scope = WorkGroup, memory_scope = WorkGroup}
79+ // SUBGROUP_SCOPE: triton_gen.split_barrier_wait {execution_scope = SubGroup, memory_scope = SubGroup}
8080 // CHECK-NEXT: scf.yield
8181 %23:3 = scf.for %arg9 = %c0_i32 to %c64_i32 step %c64_i32 iter_args (%arg10 = %cst , %arg11 = %18 , %arg12 = %22 ) -> (tensor <128 x256 xf32 , #dpas >, !tt.ptr <tensor <128 x64 xf16 , #dot0 >>, !tt.ptr <tensor <64 x256 xf16 , #dot1 >>) : i32 {
8282 %56 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >, ttig.block_io = " row_major" } : !tt.ptr <tensor <128 x64 xf16 , #dot0 >>
0 commit comments