@@ -2663,4 +2663,217 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
26632663 ret <4 x float > %result
26642664}
26652665
2666+ ; --------------------------------------------------------------------
2667+ ; llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8
2668+ ; --------------------------------------------------------------------
2669+
; Intrinsic under test. Operands, in order: a <4 x i32> vector, an <8 x i32>
; vector, a <4 x float> vector, an i32, and two i32 immediates (immarg).
; In the expected assembly of the tests in this section, non-zero values of
; the two immediates are printed as the cbsz and abid modifiers respectively,
; and both modifiers are absent when the immediates are 0.
2670+ declare <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 >, <8 x i32 >, <4 x float >, i32 , i32 immarg, i32 immarg)
2671+
; Kernel test: the <4 x float> third operand is loaded from %arg at the
; workitem-id-x element, while %a, %b and %idx are kernel arguments. The
; intrinsic is called with immediates 1 and 2, and the result is stored to
; %arg itself (not to %gep).
; Both check prefixes expect the loaded value and the instruction result to
; share v[8:11], the kernel arguments to be copied from SGPRs into VGPRs
; first, and the instruction to carry cbsz:1 abid:2.
; NOTE(review): the assertion lines look tool-generated; presumably they
; should be regenerated rather than hand-edited - confirm against the file
; header, which is outside this excerpt.
2672+ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr (ptr addrspace (1 ) %arg , <4 x i32 > %a , <8 x i32 > %b , i32 %idx ) #0 {
2673+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
2674+ ; SDAG: ; %bb.0: ; %bb
2675+ ; SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2676+ ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2677+ ; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2678+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2679+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2680+ ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2681+ ; SDAG-NEXT: s_load_dword s16, s[0:1], 0x64
2682+ ; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2683+ ; SDAG-NEXT: v_mov_b32_e32 v12, s4
2684+ ; SDAG-NEXT: v_mov_b32_e32 v13, s5
2685+ ; SDAG-NEXT: v_mov_b32_e32 v14, s6
2686+ ; SDAG-NEXT: v_mov_b32_e32 v15, s7
2687+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2688+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2689+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2690+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2691+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2692+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2693+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2694+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2695+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2696+ ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2697+ ; SDAG-NEXT: s_waitcnt vmcnt(0)
2698+ ; SDAG-NEXT: s_nop 0
2699+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2700+ ; SDAG-NEXT: s_nop 6
2701+ ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3]
2702+ ; SDAG-NEXT: s_endpgm
2703+ ;
2704+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
2705+ ; GISEL: ; %bb.0: ; %bb
2706+ ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2707+ ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2708+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2709+ ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2710+ ; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2711+ ; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2712+ ; GISEL-NEXT: s_load_dword s16, s[0:1], 0x64
2713+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2714+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[6:7]
2715+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[4:5]
2716+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2717+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2718+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2719+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2720+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2721+ ; GISEL-NEXT: s_waitcnt vmcnt(0)
2722+ ; GISEL-NEXT: s_nop 0
2723+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2724+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2725+ ; GISEL-NEXT: s_nop 5
2726+ ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[2:3]
2727+ ; GISEL-NEXT: s_endpgm
2728+ bb:
2729+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
2730+ %gep = getelementptr <4 x float >, ptr addrspace (1 ) %arg , i32 %id
2731+ %in.1 = load <4 x float >, ptr addrspace (1 ) %gep
2732+ %mai.1 = tail call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %a , <8 x i32 > %b , <4 x float > %in.1 , i32 %idx , i32 1 , i32 2 )
2733+ store <4 x float > %mai.1 , ptr addrspace (1 ) %arg
2734+ ret void
2735+ }
2736+
; Non-kernel function test: all four non-immediate operands arrive as plain
; function arguments and both immediates are 0, so the expected instruction
; carries no cbsz/abid modifiers.
; The SDAG prefix expects the third operand (v12-v15) to be written into
; a[0:3] before the instruction and read back into v0-v3 afterwards.
; The GISEL prefix expects the instruction to operate on v[12:15] directly,
; followed by v_mov copies into v0-v3.
2737+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2738+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
2739+ ; SDAG: ; %bb.0:
2740+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2741+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2742+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2743+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2744+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2745+ ; SDAG-NEXT: s_nop 1
2746+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16
2747+ ; SDAG-NEXT: s_nop 6
2748+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2749+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2750+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2751+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2752+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2753+ ;
2754+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
2755+ ; GISEL: ; %bb.0:
2756+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2757+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
2758+ ; GISEL-NEXT: s_nop 6
2759+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2760+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2761+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2762+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2763+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2764+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2765+ ret <4 x float > %result
2766+ }
2767+
; Same shape as the plain function test, but the call passes immediates 1 and
; 3; both check prefixes expect the instruction to be printed with
; cbsz:1 abid:3. Register usage in the expected output is otherwise the same
; as in the immediate-0 variant.
2768+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_fp8__flags0 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2769+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
2770+ ; SDAG: ; %bb.0:
2771+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2772+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2773+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2774+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2775+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2776+ ; SDAG-NEXT: s_nop 1
2777+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2778+ ; SDAG-NEXT: s_nop 6
2779+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2780+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2781+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2782+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2783+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2784+ ;
2785+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
2786+ ; GISEL: ; %bb.0:
2787+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2788+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2789+ ; GISEL-NEXT: s_nop 6
2790+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2791+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2792+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2793+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2794+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2795+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 1 , i32 immarg 3 )
2796+ ret <4 x float > %result
2797+ }
2798+
; Counterpart of the __flags0 test with the two immediate values swapped: the
; call passes 3 and 1, and both check prefixes expect cbsz:3 abid:1 on the
; instruction. Together the two tests show which immediate maps to which
; printed modifier.
2799+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_fp8__flags1 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2800+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
2801+ ; SDAG: ; %bb.0:
2802+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2803+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2804+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2805+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2806+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2807+ ; SDAG-NEXT: s_nop 1
2808+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2809+ ; SDAG-NEXT: s_nop 6
2810+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2811+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2812+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2813+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2814+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2815+ ;
2816+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
2817+ ; GISEL: ; %bb.0:
2818+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2819+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2820+ ; GISEL-NEXT: s_nop 6
2821+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2822+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2823+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2824+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2825+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2826+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 3 , i32 immarg 1 )
2827+ ret <4 x float > %result
2828+ }
2829+
; Variant where every argument is marked inreg; the expected output shows the
; inputs arriving in s0-s16. Both immediates are 0, so no cbsz/abid modifiers
; are expected.
; The SDAG prefix expects s0-s11 and s16 to be copied into VGPRs, s12-s15 to
; be written into a[0:3], and the result to be read back into v0-v3.
; The GISEL prefix expects all inputs to be copied into VGPRs, with the
; instruction writing v[0:3] directly and no copies after it.
2830+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_fp8__sgpr (<4 x i32 > inreg %arg0 , <8 x i32 > inreg %arg1 , <4 x float > inreg %arg2 , i32 inreg %arg3 ) {
2831+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
2832+ ; SDAG: ; %bb.0:
2833+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2834+ ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2835+ ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2836+ ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2837+ ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2838+ ; SDAG-NEXT: v_mov_b32_e32 v0, s4
2839+ ; SDAG-NEXT: v_mov_b32_e32 v1, s5
2840+ ; SDAG-NEXT: v_mov_b32_e32 v2, s6
2841+ ; SDAG-NEXT: v_mov_b32_e32 v3, s7
2842+ ; SDAG-NEXT: v_mov_b32_e32 v4, s8
2843+ ; SDAG-NEXT: v_mov_b32_e32 v5, s9
2844+ ; SDAG-NEXT: v_mov_b32_e32 v6, s10
2845+ ; SDAG-NEXT: v_mov_b32_e32 v7, s11
2846+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s12
2847+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s13
2848+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s14
2849+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s15
2850+ ; SDAG-NEXT: v_mov_b32_e32 v12, s16
2851+ ; SDAG-NEXT: s_nop 1
2852+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12
2853+ ; SDAG-NEXT: s_nop 6
2854+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2855+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2856+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2857+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2858+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2859+ ;
2860+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
2861+ ; GISEL: ; %bb.0:
2862+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2863+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2864+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2865+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
2866+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2867+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
2868+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
2869+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
2870+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2871+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2872+ ; GISEL-NEXT: s_nop 1
2873+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
2874+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2875+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2876+ ret <4 x float > %result
2877+ }
2878+
; Attribute group #0, referenced by the amdgpu_kernel test
; @test_smfmac_f32_16x16x128_fp8_fp8__vgpr; it sets the string attribute
; "amdgpu-flat-work-group-size" to "1,256".
26662879attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" }
0 commit comments