77; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
88; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
99; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
10- <<<<<<< HEAD
1110; xUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250TRUE16
1211; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250FAKE16
1312
1413; FIXME: real-true16 version of gfx1250 test fails
15- =======
16- ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck %s -check-prefixes=GFX1250
17- >>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
1814
1915define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2016; GCN-LABEL: test_load_store:
@@ -49042,6 +49038,9 @@ declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
4904249038declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
4904349039declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
4904449040declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
49041+ declare <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
49042+ declare <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>)
49043+ declare <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat>, <32 x bfloat>, <32 x bfloat>)
4904549044
4904649045define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
4904749046; GCN-LABEL: v_fma_bf16:
@@ -49363,10 +49362,7 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
4936349362; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
4936449363; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
4936549364; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
49366- <<<<<<< HEAD
4936749365;
49368- =======
49369- >>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
4937049366; GFX1250-LABEL: v_fma_v2bf16:
4937149367; GFX1250: ; %bb.0:
4937249368; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -49641,7 +49637,6 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
4964149637; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
4964249638; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16
4964349639; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
49644- <<<<<<< HEAD
4964549640;
4964649641; GFX1250-LABEL: v_fma_v3bf16:
4964749642; GFX1250: ; %bb.0:
@@ -49650,15 +49645,6 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
4965049645; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4
4965149646; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5
4965249647; GFX1250-NEXT: s_set_pc_i64 s[30:31]
49653- =======
49654- ; GFX1250-LABEL: v_fma_v3bf16:
49655- ; GFX1250: %bb.0:
49656- ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
49657- ; GFX1250-NEXT: s_wait_kmcnt 0x0
49658- ; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4
49659- ; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5
49660- ; GFX1250-NEXT: s_set_pc_i64 s[30:31]
49661- >>>>>>> cc3762e87c75 (Add testing coverage - part I)
4966249648 %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
4966349649 ret <3 x bfloat> %op
4966449650}
@@ -49993,10 +49979,7 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
4999349979; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
4999449980; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
4999549981; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
49996- <<<<<<< HEAD
4999749982;
49998- =======
49999- >>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
5000049983; GFX1250-LABEL: v_fma_v4bf16:
5000149984; GFX1250: ; %bb.0:
5000249985; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -50008,6 +49991,98 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
5000849991 ret <4 x bfloat> %op
5000949992}
5001049993
49994+ ; GFX1250-LABEL: v_fma_v8bf16:
49995+ ; GFX1250: ; %bb.0:
49996+ ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
49997+ ; GFX1250-NEXT: s_wait_kmcnt 0x0
49998+ ; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v4, v8
49999+ ; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v5, v9
50000+ ; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v6, v10
50001+ ; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v7, v11
50002+ ; GFX1250-NEXT: s_set_pc_i64 s[30:31]
50003+ define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
50004+ %op = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c)
50005+ ret <8 x bfloat> %op
50006+ }
50007+
50008+ ; GFX1250-LABEL: v_fma_v16bf16:
50009+ ; GFX1250: ; %bb.0:
50010+ ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
50011+ ; GFX1250-NEXT: s_wait_kmcnt 0x0
50012+ ; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v8, v16
50013+ ; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v9, v17
50014+ ; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v10, v18
50015+ ; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v11, v19
50016+ ; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v12, v20
50017+ ; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v13, v21
50018+ ; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v14, v22
50019+ ; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v15, v23
50020+ ; GFX1250-NEXT: s_set_pc_i64 s[30:31]
50021+ define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) {
50022+ %op = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c)
50023+ ret <16 x bfloat> %op
50024+ }
50025+
50026+ ; GFX1250-LABEL: v_fma_v32bf16:
50027+ ; GFX1250: ; %bb.0:
50028+ ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
50029+ ; GFX1250-NEXT: s_wait_kmcnt 0x0
50030+ ; GFX1250-NEXT: s_clause 0x10
50031+ ; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:64
50032+ ; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4
50033+ ; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8
50034+ ; GFX1250-NEXT: scratch_load_b32 v34, off, s32 offset:12
50035+ ; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:16
50036+ ; GFX1250-NEXT: scratch_load_b32 v36, off, s32 offset:20
50037+ ; GFX1250-NEXT: scratch_load_b32 v37, off, s32 offset:24
50038+ ; GFX1250-NEXT: scratch_load_b32 v38, off, s32 offset:28
50039+ ; GFX1250-NEXT: scratch_load_b32 v39, off, s32 offset:32
50040+ ; GFX1250-NEXT: scratch_load_b32 v48, off, s32 offset:36
50041+ ; GFX1250-NEXT: scratch_load_b32 v49, off, s32 offset:40
50042+ ; GFX1250-NEXT: scratch_load_b32 v50, off, s32 offset:44
50043+ ; GFX1250-NEXT: scratch_load_b32 v51, off, s32 offset:48
50044+ ; GFX1250-NEXT: scratch_load_b32 v52, off, s32 offset:52
50045+ ; GFX1250-NEXT: scratch_load_b32 v53, off, s32 offset:56
50046+ ; GFX1250-NEXT: scratch_load_b32 v54, off, s32 offset:60
50047+ ; GFX1250-NEXT: scratch_load_b32 v55, off, s32
50048+ ; GFX1250-NEXT: s_wait_loadcnt 0xf
50049+ ; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v16, v32
50050+ ; GFX1250-NEXT: s_wait_loadcnt 0xe
50051+ ; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v17, v33
50052+ ; GFX1250-NEXT: s_wait_loadcnt 0xd
50053+ ; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v18, v34
50054+ ; GFX1250-NEXT: s_wait_loadcnt 0xc
50055+ ; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v19, v35
50056+ ; GFX1250-NEXT: s_wait_loadcnt 0xb
50057+ ; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v20, v36
50058+ ; GFX1250-NEXT: s_wait_loadcnt 0xa
50059+ ; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v21, v37
50060+ ; GFX1250-NEXT: s_wait_loadcnt 0x9
50061+ ; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v22, v38
50062+ ; GFX1250-NEXT: s_wait_loadcnt 0x8
50063+ ; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v23, v39
50064+ ; GFX1250-NEXT: s_wait_loadcnt 0x7
50065+ ; GFX1250-NEXT: v_pk_fma_bf16 v8, v8, v24, v48
50066+ ; GFX1250-NEXT: s_wait_loadcnt 0x6
50067+ ; GFX1250-NEXT: v_pk_fma_bf16 v9, v9, v25, v49
50068+ ; GFX1250-NEXT: s_wait_loadcnt 0x5
50069+ ; GFX1250-NEXT: v_pk_fma_bf16 v10, v10, v26, v50
50070+ ; GFX1250-NEXT: s_wait_loadcnt 0x4
50071+ ; GFX1250-NEXT: v_pk_fma_bf16 v11, v11, v27, v51
50072+ ; GFX1250-NEXT: s_wait_loadcnt 0x3
50073+ ; GFX1250-NEXT: v_pk_fma_bf16 v12, v12, v28, v52
50074+ ; GFX1250-NEXT: s_wait_loadcnt 0x2
50075+ ; GFX1250-NEXT: v_pk_fma_bf16 v13, v13, v29, v53
50076+ ; GFX1250-NEXT: s_wait_loadcnt 0x1
50077+ ; GFX1250-NEXT: v_pk_fma_bf16 v14, v14, v30, v54
50078+ ; GFX1250-NEXT: s_wait_loadcnt 0x0
50079+ ; GFX1250-NEXT: v_pk_fma_bf16 v15, v15, v55, v31
50080+ ; GFX1250-NEXT: s_set_pc_i64 s[30:31]
50081+ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) {
50082+ %op = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c)
50083+ ret <32 x bfloat> %op
50084+ }
50085+
5001150086declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat)
5001250087declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
5001350088declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
0 commit comments