@@ -773,25 +773,12 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
773
773
#blocked0 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
774
774
#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
775
775
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
776
- // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return}
776
+ // CHECK: llvm.func spir_funccc @_Z17sub_group_shufflefj(f32, i32) -> f32 attributes {convergent, no_unwind, will_return}
777
777
// CHECK-LABEL: convert_layout_blocked_blocked_multi_rep
778
778
tt.func @convert_layout_blocked_blocked_multi_rep(%arg0: tensor<16x16xf32, #blocked0>) {
779
- // CHECK: llvm.store
780
- // CHECK-SAME: vector<4xf32>, !llvm.ptr<3>
781
- // CHECK: [[ONE_1:%.*]] = llvm.mlir.constant(1 : i32) : i32
782
- // CHECK-NEXT: llvm.call spir_funccc @_Z7barrierj([[ONE_1]]) {{.*}} : (i32) -> ()
783
- // CHECK: llvm.load
784
- // CHECK-SAME: !llvm.ptr<3> -> vector<4xf32>
785
- // CHECK: llvm.load
786
- // CHECK-SAME: !llvm.ptr<3> -> vector<4xf32>
787
- // CHECK: llvm.call spir_funccc @_Z7barrierj({{.*}}) {{.*}} : (i32) -> ()
788
- // CHECK: llvm.store
789
- // CHECK-SAME: vector<4xf32>, !llvm.ptr<3>
790
- // CHECK: llvm.call spir_funccc @_Z7barrierj({{.*}}) {{.*}} : (i32) -> ()
791
- // CHECK: llvm.load
792
- // CHECK-SAME: !llvm.ptr<3> -> vector<4xf32>
793
- // CHECK: llvm.load
794
- // CHECK-SAME: !llvm.ptr<3> -> vector<4xf32>
779
+ // CHECK-COUNT-8: llvm.extractvalue %arg0{{.*}} : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
780
+ // CHECK-COUNT-4: llvm.call spir_funccc @_Z17sub_group_shufflefj({{.*}}, {{.*}}) {convergent, no_unwind, will_return} : (f32, i32) -> f32
781
+ // CHECK-COUNT-16: llvm.insertvalue {{.*}}, {{.*}} : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
795
782
%0 = ttg.convert_layout %arg0 : tensor<16x16xf32, #blocked0> -> tensor<16x16xf32, #blocked1>
796
783
tt.return
797
784
}
0 commit comments