@@ -2237,4 +2237,217 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
22372237 ret <4 x float > %result
22382238}
22392239
2240+ ; --------------------------------------------------------------------
2241+ ; llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8
2242+ ; --------------------------------------------------------------------
2243+
; Intrinsic under test. The calls in this section pass, in order: a <4 x i32>
; operand, an <8 x i32> operand, a <4 x float> input whose registers are also
; the destination in the expected assembly, an i32 index, and two immediates
; that print as the cbsz and abid modifiers in the expected assembly.
2244+ declare <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 >, <8 x i32 >, <4 x float >, i32 , i32 immarg, i32 immarg)
2245+
; Kernel variant: the <4 x float> input is loaded from %arg at the workitem-id
; element offset, while %a, %b and %idx arrive as kernel arguments. The
; immediates 1 and 2 are expected to print as "cbsz:1 abid:2". The result is
; stored back through the base pointer %arg (not through %gep).
; NOTE(review): the check lines look autogenerated (update_llc_test_checks
; layout) -- regenerate rather than hand-edit; confirm against the file header.
2246+ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr (ptr addrspace (1 ) %arg , <4 x i32 > %a , <8 x i32 > %b , i32 %idx ) #0 {
2247+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
2248+ ; SDAG: ; %bb.0: ; %bb
2249+ ; SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2250+ ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2251+ ; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2252+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2253+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2254+ ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2255+ ; SDAG-NEXT: s_load_dword s16, s[0:1], 0x64
2256+ ; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2257+ ; SDAG-NEXT: v_mov_b32_e32 v12, s4
2258+ ; SDAG-NEXT: v_mov_b32_e32 v13, s5
2259+ ; SDAG-NEXT: v_mov_b32_e32 v14, s6
2260+ ; SDAG-NEXT: v_mov_b32_e32 v15, s7
2261+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2262+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2263+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2264+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2265+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2266+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2267+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2268+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2269+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2270+ ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2271+ ; SDAG-NEXT: s_waitcnt vmcnt(0)
2272+ ; SDAG-NEXT: s_nop 0
2273+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2274+ ; SDAG-NEXT: s_nop 6
2275+ ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3]
2276+ ; SDAG-NEXT: s_endpgm
2277+ ;
2278+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
2279+ ; GISEL: ; %bb.0: ; %bb
2280+ ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2281+ ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2282+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2283+ ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2284+ ; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2285+ ; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2286+ ; GISEL-NEXT: s_load_dword s16, s[0:1], 0x64
2287+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2288+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[6:7]
2289+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[4:5]
2290+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2291+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2292+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2293+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2294+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2295+ ; GISEL-NEXT: s_waitcnt vmcnt(0)
2296+ ; GISEL-NEXT: s_nop 0
2297+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2298+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2299+ ; GISEL-NEXT: s_nop 5
2300+ ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[2:3]
2301+ ; GISEL-NEXT: s_endpgm
; The block label below is echoed in the expected "%bb.0" comment lines above,
; so it must keep its name.
2302+ bb:
2303+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
2304+ %gep = getelementptr <4 x float >, ptr addrspace (1 ) %arg , i32 %id
2305+ %in.1 = load <4 x float >, ptr addrspace (1 ) %gep
2306+ %mai.1 = tail call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %a , <8 x i32 > %b , <4 x float > %in.1 , i32 %idx , i32 1 , i32 2 )
2307+ store <4 x float > %mai.1 , ptr addrspace (1 ) %arg
2308+ ret void
2309+ }
2310+
; Baseline non-kernel variant: all operands are plain function arguments and
; both immediates are 0, so the expected instruction carries no cbsz/abid
; modifiers. The SDAG checks expect the <4 x float> input to be moved through
; a[0:3]; the GISEL checks expect it to stay in v[12:15] and be copied to
; v[0:3] for the return.
2311+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2312+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
2313+ ; SDAG: ; %bb.0:
2314+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2315+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2316+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2317+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2318+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2319+ ; SDAG-NEXT: s_nop 1
2320+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16
2321+ ; SDAG-NEXT: s_nop 6
2322+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2323+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2324+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2325+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2326+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2327+ ;
2328+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
2329+ ; GISEL: ; %bb.0:
2330+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2331+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
2332+ ; GISEL-NEXT: s_nop 6
2333+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2334+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2335+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2336+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2337+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2338+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2339+ ret <4 x float > %result
2340+ }
2341+
; Same operands as the baseline test, but with non-zero immediates: the call
; passes 1 and 3, and the expected instruction prints "cbsz:1 abid:3" for both
; check prefixes.
2342+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_fp8__flags0 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2343+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
2344+ ; SDAG: ; %bb.0:
2345+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2346+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2347+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2348+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2349+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2350+ ; SDAG-NEXT: s_nop 1
2351+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2352+ ; SDAG-NEXT: s_nop 6
2353+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2354+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2355+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2356+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2357+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2358+ ;
2359+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
2360+ ; GISEL: ; %bb.0:
2361+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2362+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2363+ ; GISEL-NEXT: s_nop 6
2364+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2365+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2366+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2367+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2368+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2369+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 1 , i32 immarg 3 )
2370+ ret <4 x float > %result
2371+ }
2372+
; Counterpart of the __flags0 test with the immediate values swapped: the call
; passes 3 and 1, and the expected instruction prints "cbsz:3 abid:1" for both
; check prefixes.
2373+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_fp8__flags1 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2374+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
2375+ ; SDAG: ; %bb.0:
2376+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2377+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2378+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2379+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2380+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2381+ ; SDAG-NEXT: s_nop 1
2382+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2383+ ; SDAG-NEXT: s_nop 6
2384+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2385+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2386+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2387+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2388+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2389+ ;
2390+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
2391+ ; GISEL: ; %bb.0:
2392+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2393+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2394+ ; GISEL-NEXT: s_nop 6
2395+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2396+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2397+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2398+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2399+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2400+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 3 , i32 immarg 1 )
2401+ ret <4 x float > %result
2402+ }
2403+
; Variant with every argument marked inreg, so the inputs arrive in s0..s16.
; The expected code copies them into vector registers (and, for the SDAG
; checks, the <4 x float> input into a[0:3]) before issuing the instruction.
; Both immediates are 0, so no cbsz/abid modifiers are expected.
2404+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_fp8__sgpr (<4 x i32 > inreg %arg0 , <8 x i32 > inreg %arg1 , <4 x float > inreg %arg2 , i32 inreg %arg3 ) {
2405+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
2406+ ; SDAG: ; %bb.0:
2407+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2408+ ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2409+ ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2410+ ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2411+ ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2412+ ; SDAG-NEXT: v_mov_b32_e32 v0, s4
2413+ ; SDAG-NEXT: v_mov_b32_e32 v1, s5
2414+ ; SDAG-NEXT: v_mov_b32_e32 v2, s6
2415+ ; SDAG-NEXT: v_mov_b32_e32 v3, s7
2416+ ; SDAG-NEXT: v_mov_b32_e32 v4, s8
2417+ ; SDAG-NEXT: v_mov_b32_e32 v5, s9
2418+ ; SDAG-NEXT: v_mov_b32_e32 v6, s10
2419+ ; SDAG-NEXT: v_mov_b32_e32 v7, s11
2420+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s12
2421+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s13
2422+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s14
2423+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s15
2424+ ; SDAG-NEXT: v_mov_b32_e32 v12, s16
2425+ ; SDAG-NEXT: s_nop 1
2426+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12
2427+ ; SDAG-NEXT: s_nop 6
2428+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2429+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2430+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2431+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2432+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2433+ ;
2434+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
2435+ ; GISEL: ; %bb.0:
2436+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2437+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2438+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2439+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
2440+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2441+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
2442+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
2443+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
2444+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2445+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2446+ ; GISEL-NEXT: s_nop 1
2447+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
2448+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2449+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2450+ ret <4 x float > %result
2451+ }
2452+
; Attribute group #0, referenced by the amdgpu_kernel test definitions in this
; file. NOTE(review): the "1,256" value is presumably the min,max flat
; work-group size -- confirm against the AMDGPU backend documentation.
22402453attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" }
0 commit comments