Skip to content

Commit 0b36339

Browse files
committed
Latest fixes
1 parent 32d932f commit 0b36339

File tree

6 files changed

+606
-532
lines changed

6 files changed

+606
-532
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6688,7 +6688,6 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
66886688
SelectionDAG &DAG) const {
66896689
unsigned Opc = Op.getOpcode();
66906690
EVT VT = Op.getValueType();
6691-
VT.dump();
66926691
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
66936692
VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
66946693
VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||

llvm/test/Analysis/CostModel/AMDGPU/fadd.ll

Lines changed: 121 additions & 121 deletions
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/AMDGPU/fma.ll

Lines changed: 120 additions & 120 deletions
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/AMDGPU/fmul.ll

Lines changed: 149 additions & 149 deletions
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/AMDGPU/fsub.ll

Lines changed: 121 additions & 121 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 95 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,10 @@
77
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
88
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
99
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
10-
<<<<<<< HEAD
1110
; xUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250TRUE16
1211
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250FAKE16
1312

1413
; FIXME: real-true16 version of gfx1250 test fails
15-
=======
16-
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck %s -check-prefixes=GFX1250
17-
>>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
1814

1915
define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2016
; GCN-LABEL: test_load_store:
@@ -49042,6 +49038,9 @@ declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
4904249038
declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
4904349039
declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
4904449040
declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
49041+
declare <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
49042+
declare <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>)
49043+
declare <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat>, <32 x bfloat>, <32 x bfloat>)
4904549044

4904649045
define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
4904749046
; GCN-LABEL: v_fma_bf16:
@@ -49363,10 +49362,7 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
4936349362
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
4936449363
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
4936549364
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
49366-
<<<<<<< HEAD
4936749365
;
49368-
=======
49369-
>>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
4937049366
; GFX1250-LABEL: v_fma_v2bf16:
4937149367
; GFX1250: ; %bb.0:
4937249368
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -49641,7 +49637,6 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
4964149637
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
4964249638
; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16
4964349639
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
49644-
<<<<<<< HEAD
4964549640
;
4964649641
; GFX1250-LABEL: v_fma_v3bf16:
4964749642
; GFX1250: ; %bb.0:
@@ -49650,15 +49645,6 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
4965049645
; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4
4965149646
; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5
4965249647
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
49653-
=======
49654-
; GFX1250-LABEL: v_fma_v3bf16:
49655-
; GFX1250: %bb.0:
49656-
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
49657-
; GFX1250-NEXT: s_wait_kmcnt 0x0
49658-
; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4
49659-
; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5
49660-
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
49661-
>>>>>>> cc3762e87c75 (Add testing coverage - part I)
4966249648
%op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
4966349649
ret <3 x bfloat> %op
4966449650
}
@@ -49993,10 +49979,7 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
4999349979
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
4999449980
; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
4999549981
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
49996-
<<<<<<< HEAD
4999749982
;
49998-
=======
49999-
>>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
5000049983
; GFX1250-LABEL: v_fma_v4bf16:
5000149984
; GFX1250: ; %bb.0:
5000249985
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -50008,6 +49991,98 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
5000849991
ret <4 x bfloat> %op
5000949992
}
5001049993

49994+
; GFX1250-LABEL: v_fma_v8bf16:
49995+
; GFX1250: ; %bb.0:
49996+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
49997+
; GFX1250-NEXT: s_wait_kmcnt 0x0
49998+
; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v4, v8
49999+
; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v5, v9
50000+
; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v6, v10
50001+
; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v7, v11
50002+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
50003+
define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
50004+
%op = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c)
50005+
ret <8 x bfloat> %op
50006+
}
50007+
50008+
; GFX1250-LABEL: v_fma_v16bf16:
50009+
; GFX1250: ; %bb.0:
50010+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
50011+
; GFX1250-NEXT: s_wait_kmcnt 0x0
50012+
; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v8, v16
50013+
; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v9, v17
50014+
; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v10, v18
50015+
; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v11, v19
50016+
; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v12, v20
50017+
; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v13, v21
50018+
; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v14, v22
50019+
; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v15, v23
50020+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
50021+
define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) {
50022+
%op = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c)
50023+
ret <16 x bfloat> %op
50024+
}
50025+
50026+
; GFX1250-LABEL: v_fma_v32bf16:
50027+
; GFX1250: ; %bb.0:
50028+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
50029+
; GFX1250-NEXT: s_wait_kmcnt 0x0
50030+
; GFX1250-NEXT: s_clause 0x10
50031+
; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:64
50032+
; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4
50033+
; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8
50034+
; GFX1250-NEXT: scratch_load_b32 v34, off, s32 offset:12
50035+
; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:16
50036+
; GFX1250-NEXT: scratch_load_b32 v36, off, s32 offset:20
50037+
; GFX1250-NEXT: scratch_load_b32 v37, off, s32 offset:24
50038+
; GFX1250-NEXT: scratch_load_b32 v38, off, s32 offset:28
50039+
; GFX1250-NEXT: scratch_load_b32 v39, off, s32 offset:32
50040+
; GFX1250-NEXT: scratch_load_b32 v48, off, s32 offset:36
50041+
; GFX1250-NEXT: scratch_load_b32 v49, off, s32 offset:40
50042+
; GFX1250-NEXT: scratch_load_b32 v50, off, s32 offset:44
50043+
; GFX1250-NEXT: scratch_load_b32 v51, off, s32 offset:48
50044+
; GFX1250-NEXT: scratch_load_b32 v52, off, s32 offset:52
50045+
; GFX1250-NEXT: scratch_load_b32 v53, off, s32 offset:56
50046+
; GFX1250-NEXT: scratch_load_b32 v54, off, s32 offset:60
50047+
; GFX1250-NEXT: scratch_load_b32 v55, off, s32
50048+
; GFX1250-NEXT: s_wait_loadcnt 0xf
50049+
; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v16, v32
50050+
; GFX1250-NEXT: s_wait_loadcnt 0xe
50051+
; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v17, v33
50052+
; GFX1250-NEXT: s_wait_loadcnt 0xd
50053+
; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v18, v34
50054+
; GFX1250-NEXT: s_wait_loadcnt 0xc
50055+
; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v19, v35
50056+
; GFX1250-NEXT: s_wait_loadcnt 0xb
50057+
; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v20, v36
50058+
; GFX1250-NEXT: s_wait_loadcnt 0xa
50059+
; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v21, v37
50060+
; GFX1250-NEXT: s_wait_loadcnt 0x9
50061+
; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v22, v38
50062+
; GFX1250-NEXT: s_wait_loadcnt 0x8
50063+
; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v23, v39
50064+
; GFX1250-NEXT: s_wait_loadcnt 0x7
50065+
; GFX1250-NEXT: v_pk_fma_bf16 v8, v8, v24, v48
50066+
; GFX1250-NEXT: s_wait_loadcnt 0x6
50067+
; GFX1250-NEXT: v_pk_fma_bf16 v9, v9, v25, v49
50068+
; GFX1250-NEXT: s_wait_loadcnt 0x5
50069+
; GFX1250-NEXT: v_pk_fma_bf16 v10, v10, v26, v50
50070+
; GFX1250-NEXT: s_wait_loadcnt 0x4
50071+
; GFX1250-NEXT: v_pk_fma_bf16 v11, v11, v27, v51
50072+
; GFX1250-NEXT: s_wait_loadcnt 0x3
50073+
; GFX1250-NEXT: v_pk_fma_bf16 v12, v12, v28, v52
50074+
; GFX1250-NEXT: s_wait_loadcnt 0x2
50075+
; GFX1250-NEXT: v_pk_fma_bf16 v13, v13, v29, v53
50076+
; GFX1250-NEXT: s_wait_loadcnt 0x1
50077+
; GFX1250-NEXT: v_pk_fma_bf16 v14, v14, v30, v54
50078+
; GFX1250-NEXT: s_wait_loadcnt 0x0
50079+
; GFX1250-NEXT: v_pk_fma_bf16 v15, v15, v55, v31
50080+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
50081+
define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) {
50082+
%op = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c)
50083+
ret <32 x bfloat> %op
50084+
}
50085+
5001150086
declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat)
5001250087
declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
5001350088
declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)

0 commit comments

Comments
 (0)