7
7
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
8
8
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
9
9
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
10
- <<<<<<< HEAD
11
10
; xUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250TRUE16
12
11
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250FAKE16
13
12
14
13
; FIXME: real-true16 version of gfx1250 test fails
15
- =======
16
- ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck %s -check-prefixes=GFX1250
17
- >>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
18
14
19
15
define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
20
16
; GCN-LABEL: test_load_store:
@@ -49042,6 +49038,9 @@ declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
49042
49038
declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
49043
49039
declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
49044
49040
declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
49041
+ declare <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
49042
+ declare <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>)
49043
+ declare <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat>, <32 x bfloat>, <32 x bfloat>)
49045
49044
49046
49045
define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
49047
49046
; GCN-LABEL: v_fma_bf16:
@@ -49363,10 +49362,7 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
49363
49362
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
49364
49363
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
49365
49364
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
49366
- <<<<<<< HEAD
49367
49365
;
49368
- =======
49369
- >>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
49370
49366
; GFX1250-LABEL: v_fma_v2bf16:
49371
49367
; GFX1250: ; %bb.0:
49372
49368
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -49641,7 +49637,6 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
49641
49637
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
49642
49638
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16
49643
49639
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
49644
- <<<<<<< HEAD
49645
49640
;
49646
49641
; GFX1250-LABEL: v_fma_v3bf16:
49647
49642
; GFX1250: ; %bb.0:
@@ -49650,15 +49645,6 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
49650
49645
; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4
49651
49646
; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5
49652
49647
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
49653
- =======
49654
- ; GFX1250-LABEL: v_fma_v3bf16:
49655
- ; GFX1250: %bb.0:
49656
- ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
49657
- ; GFX1250-NEXT: s_wait_kmcnt 0x0
49658
- ; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4
49659
- ; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5
49660
- ; GFX1250-NEXT: s_set_pc_i64 s[30:31]
49661
- >>>>>>> cc3762e87c75 (Add testing coverage - part I)
49662
49648
%op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
49663
49649
ret <3 x bfloat> %op
49664
49650
}
@@ -49993,10 +49979,7 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
49993
49979
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
49994
49980
; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
49995
49981
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
49996
- <<<<<<< HEAD
49997
49982
;
49998
- =======
49999
- >>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
50000
49983
; GFX1250-LABEL: v_fma_v4bf16:
50001
49984
; GFX1250: ; %bb.0:
50002
49985
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -50008,6 +49991,98 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
50008
49991
ret <4 x bfloat> %op
50009
49992
}
50010
49993
49994
+ ; GFX1250-LABEL: v_fma_v8bf16:
49995
+ ; GFX1250: ; %bb.0:
49996
+ ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
49997
+ ; GFX1250-NEXT: s_wait_kmcnt 0x0
49998
+ ; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v4, v8
49999
+ ; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v5, v9
50000
+ ; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v6, v10
50001
+ ; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v7, v11
50002
+ ; GFX1250-NEXT: s_set_pc_i64 s[30:31]
50003
+ define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
50004
+ %op = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c)
50005
+ ret <8 x bfloat> %op
50006
+ }
50007
+
50008
+ ; GFX1250-LABEL: v_fma_v16bf16:
50009
+ ; GFX1250: ; %bb.0:
50010
+ ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
50011
+ ; GFX1250-NEXT: s_wait_kmcnt 0x0
50012
+ ; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v8, v16
50013
+ ; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v9, v17
50014
+ ; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v10, v18
50015
+ ; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v11, v19
50016
+ ; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v12, v20
50017
+ ; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v13, v21
50018
+ ; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v14, v22
50019
+ ; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v15, v23
50020
+ ; GFX1250-NEXT: s_set_pc_i64 s[30:31]
50021
+ define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) {
50022
+ %op = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c)
50023
+ ret <16 x bfloat> %op
50024
+ }
50025
+
50026
+ ; GFX1250-LABEL: v_fma_v32bf16:
50027
+ ; GFX1250: ; %bb.0:
50028
+ ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
50029
+ ; GFX1250-NEXT: s_wait_kmcnt 0x0
50030
+ ; GFX1250-NEXT: s_clause 0x10
50031
+ ; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:64
50032
+ ; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4
50033
+ ; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8
50034
+ ; GFX1250-NEXT: scratch_load_b32 v34, off, s32 offset:12
50035
+ ; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:16
50036
+ ; GFX1250-NEXT: scratch_load_b32 v36, off, s32 offset:20
50037
+ ; GFX1250-NEXT: scratch_load_b32 v37, off, s32 offset:24
50038
+ ; GFX1250-NEXT: scratch_load_b32 v38, off, s32 offset:28
50039
+ ; GFX1250-NEXT: scratch_load_b32 v39, off, s32 offset:32
50040
+ ; GFX1250-NEXT: scratch_load_b32 v48, off, s32 offset:36
50041
+ ; GFX1250-NEXT: scratch_load_b32 v49, off, s32 offset:40
50042
+ ; GFX1250-NEXT: scratch_load_b32 v50, off, s32 offset:44
50043
+ ; GFX1250-NEXT: scratch_load_b32 v51, off, s32 offset:48
50044
+ ; GFX1250-NEXT: scratch_load_b32 v52, off, s32 offset:52
50045
+ ; GFX1250-NEXT: scratch_load_b32 v53, off, s32 offset:56
50046
+ ; GFX1250-NEXT: scratch_load_b32 v54, off, s32 offset:60
50047
+ ; GFX1250-NEXT: scratch_load_b32 v55, off, s32
50048
+ ; GFX1250-NEXT: s_wait_loadcnt 0xf
50049
+ ; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v16, v32
50050
+ ; GFX1250-NEXT: s_wait_loadcnt 0xe
50051
+ ; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v17, v33
50052
+ ; GFX1250-NEXT: s_wait_loadcnt 0xd
50053
+ ; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v18, v34
50054
+ ; GFX1250-NEXT: s_wait_loadcnt 0xc
50055
+ ; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v19, v35
50056
+ ; GFX1250-NEXT: s_wait_loadcnt 0xb
50057
+ ; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v20, v36
50058
+ ; GFX1250-NEXT: s_wait_loadcnt 0xa
50059
+ ; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v21, v37
50060
+ ; GFX1250-NEXT: s_wait_loadcnt 0x9
50061
+ ; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v22, v38
50062
+ ; GFX1250-NEXT: s_wait_loadcnt 0x8
50063
+ ; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v23, v39
50064
+ ; GFX1250-NEXT: s_wait_loadcnt 0x7
50065
+ ; GFX1250-NEXT: v_pk_fma_bf16 v8, v8, v24, v48
50066
+ ; GFX1250-NEXT: s_wait_loadcnt 0x6
50067
+ ; GFX1250-NEXT: v_pk_fma_bf16 v9, v9, v25, v49
50068
+ ; GFX1250-NEXT: s_wait_loadcnt 0x5
50069
+ ; GFX1250-NEXT: v_pk_fma_bf16 v10, v10, v26, v50
50070
+ ; GFX1250-NEXT: s_wait_loadcnt 0x4
50071
+ ; GFX1250-NEXT: v_pk_fma_bf16 v11, v11, v27, v51
50072
+ ; GFX1250-NEXT: s_wait_loadcnt 0x3
50073
+ ; GFX1250-NEXT: v_pk_fma_bf16 v12, v12, v28, v52
50074
+ ; GFX1250-NEXT: s_wait_loadcnt 0x2
50075
+ ; GFX1250-NEXT: v_pk_fma_bf16 v13, v13, v29, v53
50076
+ ; GFX1250-NEXT: s_wait_loadcnt 0x1
50077
+ ; GFX1250-NEXT: v_pk_fma_bf16 v14, v14, v30, v54
50078
+ ; GFX1250-NEXT: s_wait_loadcnt 0x0
50079
+ ; GFX1250-NEXT: v_pk_fma_bf16 v15, v15, v55, v31
50080
+ ; GFX1250-NEXT: s_set_pc_i64 s[30:31]
50081
+ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) {
50082
+ %op = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c)
50083
+ ret <32 x bfloat> %op
50084
+ }
50085
+
50011
50086
declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat)
50012
50087
declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
50013
50088
declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
0 commit comments