@@ -2540,6 +2540,221 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
25402540 ret <4 x float > %result
25412541}
25422542
2543+ ; --------------------------------------------------------------------
2544+ ; llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8
2545+ ; --------------------------------------------------------------------
2546+
; Intrinsic under test. Returns <4 x float>; takes a <4 x i32>, an <8 x i32>,
; a <4 x float>, an i32, and two trailing i32 operands marked immarg.
; NOTE(review): operand roles (A, B, accumulator, index, cbsz, abid) are inferred
; from how the tests in this section use them -- confirm against the intrinsic definition.
2547+ declare <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 >, <8 x i32 >, <4 x float >, i32 , i32 immarg, i32 immarg)
2548+
; Kernel variant: %a, %b and %idx arrive as kernel arguments, and the third
; intrinsic operand is loaded from global memory at the workitem's index.
; The call passes immediates 1 and 2; both SDAG and GISEL check lines expect
; them to be printed as "cbsz:1 abid:2" on the v_smfmac instruction.
; NOTE(review): the check lines look script-generated -- presumably they should
; be regenerated rather than hand-edited; confirm against the file header.
2549+ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr (ptr addrspace (1 ) %arg , <4 x i32 > %a , <8 x i32 > %b , i32 %idx ) #0 {
2550+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
2551+ ; SDAG: ; %bb.0: ; %bb
2552+ ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2553+ ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2554+ ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2555+ ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2556+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2557+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2558+ ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
2559+ ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
2560+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
2561+ ; SDAG-NEXT: v_mov_b32_e32 v12, s8
2562+ ; SDAG-NEXT: v_mov_b32_e32 v13, s9
2563+ ; SDAG-NEXT: v_mov_b32_e32 v14, s10
2564+ ; SDAG-NEXT: v_mov_b32_e32 v15, s11
2565+ ; SDAG-NEXT: v_mov_b32_e32 v0, s12
2566+ ; SDAG-NEXT: v_mov_b32_e32 v1, s13
2567+ ; SDAG-NEXT: v_mov_b32_e32 v2, s14
2568+ ; SDAG-NEXT: v_mov_b32_e32 v3, s15
2569+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2570+ ; SDAG-NEXT: v_mov_b32_e32 v4, s0
2571+ ; SDAG-NEXT: v_mov_b32_e32 v5, s1
2572+ ; SDAG-NEXT: v_mov_b32_e32 v6, s2
2573+ ; SDAG-NEXT: v_mov_b32_e32 v7, s3
2574+ ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2575+ ; SDAG-NEXT: s_waitcnt vmcnt(0)
2576+ ; SDAG-NEXT: s_nop 0
2577+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2578+ ; SDAG-NEXT: s_nop 6
2579+ ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
2580+ ; SDAG-NEXT: s_endpgm
2581+ ;
2582+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
2583+ ; GISEL: ; %bb.0: ; %bb
2584+ ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2585+ ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2586+ ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2587+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2588+ ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
2589+ ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2590+ ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
2591+ ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
2592+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2593+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
2594+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
2595+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2596+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2597+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2598+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2599+ ; GISEL-NEXT: v_mov_b32_e32 v16, s2
2600+ ; GISEL-NEXT: s_waitcnt vmcnt(0)
2601+ ; GISEL-NEXT: s_nop 0
2602+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2603+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2604+ ; GISEL-NEXT: s_nop 5
2605+ ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
2606+ ; GISEL-NEXT: s_endpgm
2607+ bb:
; Load this workitem's <4 x float> element of the array at %arg; it becomes
; the third operand of the intrinsic call.
2608+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
2609+ %gep = getelementptr <4 x float >, ptr addrspace (1 ) %arg , i32 %id
2610+ %in.1 = load <4 x float >, ptr addrspace (1 ) %gep
2611+ %mai.1 = tail call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %a , <8 x i32 > %b , <4 x float > %in.1 , i32 %idx , i32 1 , i32 2 )
; The result is stored to the base pointer %arg, not to %gep.
2612+ store <4 x float > %mai.1 , ptr addrspace (1 ) %arg
2613+ ret void
2614+ }
2615+
; Baseline non-kernel variant: every operand is an ordinary function argument
; and both trailing immediates are 0, so the expected v_smfmac instruction
; carries no cbsz/abid modifiers.
; The SDAG checks expect %arg2 (v12-v15) to be copied into a[0:3] and the result
; read back into v0-v3; the GISEL checks expect the result in v[12:15],
; followed by moves into v0-v3.
2616+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2617+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
2618+ ; SDAG: ; %bb.0:
2619+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2620+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2621+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2622+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2623+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2624+ ; SDAG-NEXT: s_nop 1
2625+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16
2626+ ; SDAG-NEXT: s_nop 6
2627+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2628+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2629+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2630+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2631+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2632+ ;
2633+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
2634+ ; GISEL: ; %bb.0:
2635+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2636+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
2637+ ; GISEL-NEXT: s_nop 6
2638+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2639+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2640+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2641+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2642+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2643+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2644+ ret <4 x float > %result
2645+ }
2646+
; Non-zero modifier variant: the call passes immediates 1 and 3, and both the
; SDAG and GISEL check lines expect "cbsz:1 abid:3" on the v_smfmac instruction
; (fifth call operand -> cbsz, sixth -> abid).
2647+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8__flags0 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2648+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
2649+ ; SDAG: ; %bb.0:
2650+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2651+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2652+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2653+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2654+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2655+ ; SDAG-NEXT: s_nop 1
2656+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2657+ ; SDAG-NEXT: s_nop 6
2658+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2659+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2660+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2661+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2662+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2663+ ;
2664+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
2665+ ; GISEL: ; %bb.0:
2666+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2667+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2668+ ; GISEL-NEXT: s_nop 6
2669+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2670+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2671+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2672+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2673+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2674+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 1 , i32 immarg 3 )
2675+ ret <4 x float > %result
2676+ }
2677+
; Non-zero modifier variant with a different immediate pair: the call passes
; immediates 3 and 1, and both the SDAG and GISEL check lines expect
; "cbsz:3 abid:1" on the v_smfmac instruction (fifth call operand -> cbsz,
; sixth -> abid).
2678+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8__flags1 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2679+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
2680+ ; SDAG: ; %bb.0:
2681+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2682+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2683+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2684+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2685+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2686+ ; SDAG-NEXT: s_nop 1
2687+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2688+ ; SDAG-NEXT: s_nop 6
2689+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2690+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2691+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2692+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2693+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2694+ ;
2695+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
2696+ ; GISEL: ; %bb.0:
2697+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2698+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2699+ ; GISEL-NEXT: s_nop 6
2700+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2701+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2702+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2703+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2704+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2705+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 3 , i32 immarg 1 )
2706+ ret <4 x float > %result
2707+ }
2708+
; SGPR-input variant: every argument is marked inreg, and the check lines
; expect the incoming s-registers to be copied into vector registers before
; the v_smfmac instruction is issued. Both trailing immediates are 0, so no
; cbsz/abid modifiers are expected.
; The SDAG checks expect %arg2 (s24-s27) in a[0:3] with the result read back to
; v0-v3; the GISEL checks expect the result written directly to v[0:3].
2709+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8__sgpr (<4 x i32 > inreg %arg0 , <8 x i32 > inreg %arg1 , <4 x float > inreg %arg2 , i32 inreg %arg3 ) {
2710+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
2711+ ; SDAG: ; %bb.0:
2712+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2713+ ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2714+ ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2715+ ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2716+ ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2717+ ; SDAG-NEXT: v_mov_b32_e32 v0, s16
2718+ ; SDAG-NEXT: v_mov_b32_e32 v1, s17
2719+ ; SDAG-NEXT: v_mov_b32_e32 v2, s18
2720+ ; SDAG-NEXT: v_mov_b32_e32 v3, s19
2721+ ; SDAG-NEXT: v_mov_b32_e32 v4, s20
2722+ ; SDAG-NEXT: v_mov_b32_e32 v5, s21
2723+ ; SDAG-NEXT: v_mov_b32_e32 v6, s22
2724+ ; SDAG-NEXT: v_mov_b32_e32 v7, s23
2725+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
2726+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
2727+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
2728+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
2729+ ; SDAG-NEXT: v_mov_b32_e32 v12, s28
2730+ ; SDAG-NEXT: s_nop 1
2731+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12
2732+ ; SDAG-NEXT: s_nop 6
2733+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2734+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2735+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2736+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2737+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2738+ ;
2739+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
2740+ ; GISEL: ; %bb.0:
2741+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2742+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2743+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2744+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2745+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
2746+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2747+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
2748+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
2749+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
2750+ ; GISEL-NEXT: v_mov_b32_e32 v16, s28
2751+ ; GISEL-NEXT: s_nop 1
2752+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16
2753+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2754+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2755+ ret <4 x float > %result
2756+ }
2757+
25432758attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" }
25442759;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
25452760; GCN: {{.*}}
0 commit comments