@@ -2450,4 +2450,217 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
24502450 ret <4 x float > %result
24512451}
24522452
2453+ ; --------------------------------------------------------------------
2454+ ; llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8
2455+ ; --------------------------------------------------------------------
2456+
; Operand layout as exercised by the tests in this section:
;   <4 x i32>, <8 x i32>  - the two vector sources of the instruction
;   <4 x float>           - accumulator input; the result has the same type
;   i32                   - passed as %idx / %arg3 by the tests (presumably the sparse index - confirm)
;   i32 immarg x2         - printed as the cbsz and abid modifiers in the expected assembly
2457+ declare <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 >, <8 x i32 >, <4 x float >, i32 , i32 immarg, i32 immarg)
2458+
; Kernel variant. Loads the <4 x float> accumulator from %arg at the
; workitem.id.x index, passes it with %a, %b and %idx to the intrinsic using
; immediates 1 and 2, and stores the result to the base of %arg.
; Both SDAG and GISEL checks expect every instruction operand in VGPRs
; (accumulator/result in v[8:11], no accvgpr copies) and the immediates
; printed as cbsz:1 abid:2.
2459+ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr (ptr addrspace (1 ) %arg , <4 x i32 > %a , <8 x i32 > %b , i32 %idx ) #0 {
2460+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
2461+ ; SDAG: ; %bb.0: ; %bb
2462+ ; SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2463+ ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2464+ ; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2465+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2466+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2467+ ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2468+ ; SDAG-NEXT: s_load_dword s16, s[0:1], 0x64
2469+ ; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2470+ ; SDAG-NEXT: v_mov_b32_e32 v12, s4
2471+ ; SDAG-NEXT: v_mov_b32_e32 v13, s5
2472+ ; SDAG-NEXT: v_mov_b32_e32 v14, s6
2473+ ; SDAG-NEXT: v_mov_b32_e32 v15, s7
2474+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2475+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2476+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2477+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2478+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2479+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2480+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2481+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2482+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2483+ ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2484+ ; SDAG-NEXT: s_waitcnt vmcnt(0)
2485+ ; SDAG-NEXT: s_nop 0
2486+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2487+ ; SDAG-NEXT: s_nop 6
2488+ ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3]
2489+ ; SDAG-NEXT: s_endpgm
2490+ ;
2491+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
2492+ ; GISEL: ; %bb.0: ; %bb
2493+ ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2494+ ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2495+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2496+ ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2497+ ; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2498+ ; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2499+ ; GISEL-NEXT: s_load_dword s16, s[0:1], 0x64
2500+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2501+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[6:7]
2502+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[4:5]
2503+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2504+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2505+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2506+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2507+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2508+ ; GISEL-NEXT: s_waitcnt vmcnt(0)
2509+ ; GISEL-NEXT: s_nop 0
2510+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2511+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2512+ ; GISEL-NEXT: s_nop 5
2513+ ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[2:3]
2514+ ; GISEL-NEXT: s_endpgm
2515+ bb:
2516+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
2517+ %gep = getelementptr <4 x float >, ptr addrspace (1 ) %arg , i32 %id
2518+ %in.1 = load <4 x float >, ptr addrspace (1 ) %gep
2519+ %mai.1 = tail call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %a , <8 x i32 > %b , <4 x float > %in.1 , i32 %idx , i32 1 , i32 2 )
2520+ store <4 x float > %mai.1 , ptr addrspace (1 ) %arg
2521+ ret void
2522+ }
2523+
; Callable-function variant with both immediate operands set to 0; the
; expected instruction therefore carries no cbsz/abid modifiers.
; SDAG checks copy %arg2 (v12-v15) into a[0:3], accumulate there, and read the
; result back into v0-v3. GISEL checks accumulate in place in v[12:15] and
; then move the result to v0-v3.
2524+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2525+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
2526+ ; SDAG: ; %bb.0:
2527+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2528+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2529+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2530+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2531+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2532+ ; SDAG-NEXT: s_nop 1
2533+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16
2534+ ; SDAG-NEXT: s_nop 6
2535+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2536+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2537+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2538+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2539+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2540+ ;
2541+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
2542+ ; GISEL: ; %bb.0:
2543+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2544+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
2545+ ; GISEL-NEXT: s_nop 6
2546+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2547+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2548+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2549+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2550+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2551+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2552+ ret <4 x float > %result
2553+ }
2554+
; Non-zero immediates, first combination: the call passes 1 and 3 as the last
; two operands, and both SDAG and GISEL checks expect them printed as
; cbsz:1 abid:3 on the instruction. Register usage otherwise matches the
; zero-immediate function test (SDAG via a[0:3], GISEL via v[12:15]).
2555+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8__flags0 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2556+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
2557+ ; SDAG: ; %bb.0:
2558+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2559+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2560+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2561+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2562+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2563+ ; SDAG-NEXT: s_nop 1
2564+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2565+ ; SDAG-NEXT: s_nop 6
2566+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2567+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2568+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2569+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2570+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2571+ ;
2572+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
2573+ ; GISEL: ; %bb.0:
2574+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2575+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2576+ ; GISEL-NEXT: s_nop 6
2577+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2578+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2579+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2580+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2581+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2582+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 1 , i32 immarg 3 )
2583+ ret <4 x float > %result
2584+ }
2585+
; Non-zero immediates, second combination: the call passes 3 and 1 as the last
; two operands, and both SDAG and GISEL checks expect them printed as
; cbsz:3 abid:1 on the instruction. The values are the reverse of the
; __flags0 test, so a swap of the two immediates would show up in the checks.
2586+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8__flags1 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2587+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
2588+ ; SDAG: ; %bb.0:
2589+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2590+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2591+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2592+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2593+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2594+ ; SDAG-NEXT: s_nop 1
2595+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2596+ ; SDAG-NEXT: s_nop 6
2597+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2598+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2599+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2600+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2601+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2602+ ;
2603+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
2604+ ; GISEL: ; %bb.0:
2605+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2606+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2607+ ; GISEL-NEXT: s_nop 6
2608+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2609+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2610+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2611+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2612+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2613+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 3 , i32 immarg 1 )
2614+ ret <4 x float > %result
2615+ }
2616+
; SGPR-argument variant: every parameter is marked inreg and both immediates
; are 0. The checks expect the inputs to arrive in s0-s16 and to be copied out
; of SGPRs before the instruction, which takes only vector registers.
; SDAG checks place the accumulator in a[0:3] (written directly from s12-s15)
; and read the result back into v0-v3. GISEL checks place the accumulator in
; v[0:3], so the result is already in the return registers and no copy follows
; the instruction.
2617+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8__sgpr (<4 x i32 > inreg %arg0 , <8 x i32 > inreg %arg1 , <4 x float > inreg %arg2 , i32 inreg %arg3 ) {
2618+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
2619+ ; SDAG: ; %bb.0:
2620+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2621+ ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2622+ ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2623+ ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2624+ ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2625+ ; SDAG-NEXT: v_mov_b32_e32 v0, s4
2626+ ; SDAG-NEXT: v_mov_b32_e32 v1, s5
2627+ ; SDAG-NEXT: v_mov_b32_e32 v2, s6
2628+ ; SDAG-NEXT: v_mov_b32_e32 v3, s7
2629+ ; SDAG-NEXT: v_mov_b32_e32 v4, s8
2630+ ; SDAG-NEXT: v_mov_b32_e32 v5, s9
2631+ ; SDAG-NEXT: v_mov_b32_e32 v6, s10
2632+ ; SDAG-NEXT: v_mov_b32_e32 v7, s11
2633+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s12
2634+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s13
2635+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s14
2636+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s15
2637+ ; SDAG-NEXT: v_mov_b32_e32 v12, s16
2638+ ; SDAG-NEXT: s_nop 1
2639+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12
2640+ ; SDAG-NEXT: s_nop 6
2641+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2642+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2643+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2644+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2645+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2646+ ;
2647+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
2648+ ; GISEL: ; %bb.0:
2649+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2650+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2651+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2652+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
2653+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2654+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
2655+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
2656+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
2657+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2658+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2659+ ; GISEL-NEXT: s_nop 1
2660+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16
2661+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2662+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2663+ ret <4 x float > %result
2664+ }
2665+
24532666attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" }
0 commit comments