Skip to content

Commit ec59e3c

Browse files
committed
AMDGPU: Select VGPR MFMAs by default
AGPRs are undesirable since they are only usable by a handful of instructions, such as loads, stores, and MFMAs; everything else requires copies to/from VGPRs. Using the AGPR form should be a measure of last resort, needed only if we must use more than 256 VGPRs.
1 parent a1934ee commit ec59e3c

28 files changed

+10541
-10925
lines changed

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ static cl::opt<bool, true> MFMAVGPRFormOpt(
3737
"amdgpu-mfma-vgpr-form",
3838
cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
3939
"unspecified, default to compiler heuristics"),
40-
cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(false),
40+
cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(true),
4141
cl::Hidden);
4242

4343
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll

Lines changed: 128 additions & 172 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2-
# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST
3-
# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-fast -amdgpu-mfma-vgpr-form=0 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST
3+
# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-greedy -amdgpu-mfma-vgpr-form=0 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY
44

55
---
66
name: mfma_f32_32x32x4bf16_1k_vva

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2-
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST
3-
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=regbankselect -regbankselect-fast -amdgpu-mfma-vgpr-form=0 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST
3+
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=regbankselect -regbankselect-greedy -amdgpu-mfma-vgpr-form=0 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY
44

55
---
66
name: mfma_i32_16x16x32_i8_vva

llvm/test/CodeGen/AMDGPU/acc-ldst.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
2-
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
1+
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 -early-live-intervals < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
33

44
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
55
declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
3-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-mfma-vgpr-form=0 -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
44

55
; This testcase would fail on GFX908 due to not having a free VGPR available to
66
; copy between AGPRs.

llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
22

3-
; Make sure flag is ignored
4-
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-vgpr-form=1 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
5-
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
3+
; Make sure flag is ignored for gfx908
4+
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-vgpr-form=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
5+
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
66

77
; GFX9-DAG: buffer_load_format_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding:
88
; GFX9-DAG: buffer_load_format_d16_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding:

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll

Lines changed: 123 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -6,146 +6,144 @@ define amdgpu_kernel void @MFMAExpInterleave(ptr addrspace(1) %out0, ptr addrspa
66
; GCN: ; %bb.0:
77
; GCN-NEXT: s_load_dword s6, s[4:5], 0x10
88
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
9-
; GCN-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b
10-
; GCN-NEXT: v_mov_b32_e32 v0, 1.0
9+
; GCN-NEXT: v_mov_b32_e32 v5, 0x3fb8aa3b
10+
; GCN-NEXT: v_mov_b32_e32 v4, 1.0
1111
; GCN-NEXT: s_mov_b32 s7, 0x42b17218
1212
; GCN-NEXT: s_waitcnt lgkmcnt(0)
13-
; GCN-NEXT: v_mul_f32_e32 v2, s6, v1
14-
; GCN-NEXT: v_rndne_f32_e32 v3, v2
15-
; GCN-NEXT: v_sub_f32_e32 v4, v2, v3
16-
; GCN-NEXT: v_fma_f32 v1, s6, v1, -v2
17-
; GCN-NEXT: v_mov_b32_e32 v2, 0x32a5705f
18-
; GCN-NEXT: v_accvgpr_write_b32 a0, s0
19-
; GCN-NEXT: v_fmac_f32_e32 v1, s6, v2
20-
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
21-
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
22-
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
23-
; GCN-NEXT: v_add_f32_e32 v1, v4, v1
24-
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v3
25-
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
26-
; GCN-NEXT: v_exp_f32_e32 v1, v1
13+
; GCN-NEXT: v_mul_f32_e32 v6, s6, v5
14+
; GCN-NEXT: v_rndne_f32_e32 v7, v6
15+
; GCN-NEXT: v_sub_f32_e32 v8, v6, v7
16+
; GCN-NEXT: v_fma_f32 v5, s6, v5, -v6
17+
; GCN-NEXT: v_mov_b32_e32 v6, 0x32a5705f
18+
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
19+
; GCN-NEXT: v_fmac_f32_e32 v5, s6, v6
20+
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
21+
; GCN-NEXT: v_add_f32_e32 v5, v8, v5
22+
; GCN-NEXT: v_cvt_i32_f32_e32 v6, v7
23+
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
24+
; GCN-NEXT: v_exp_f32_e32 v5, v5
2725
; GCN-NEXT: s_mov_b32 s0, 0x3fb8aa3b
28-
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
26+
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
2927
; GCN-NEXT: ; iglp_opt mask(0x00000003)
30-
; GCN-NEXT: v_ldexp_f32 v1, v1, v2
31-
; GCN-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0
32-
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v2
33-
; GCN-NEXT: v_mov_b32_e32 v2, 0x42b17218
28+
; GCN-NEXT: v_ldexp_f32 v5, v5, v6
29+
; GCN-NEXT: v_mov_b32_e32 v6, 0xc2ce8ed0
30+
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v6
31+
; GCN-NEXT: v_mov_b32_e32 v6, 0x42b17218
3432
; GCN-NEXT: s_nop 0
35-
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
36-
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v2
37-
; GCN-NEXT: v_mov_b32_e32 v2, 0x7f800000
33+
; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
34+
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6
35+
; GCN-NEXT: v_mov_b32_e32 v6, 0x7f800000
3836
; GCN-NEXT: s_mov_b32 s6, 0xc2ce8ed0
39-
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
40-
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
41-
; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
42-
; GCN-NEXT: v_rndne_f32_e32 v5, v3
43-
; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
44-
; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
45-
; GCN-NEXT: v_add_f32_e32 v3, v3, v4
46-
; GCN-NEXT: v_exp_f32_e32 v3, v3
47-
; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
48-
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
49-
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
50-
; GCN-NEXT: v_ldexp_f32 v3, v3, v4
51-
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
52-
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
37+
; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
38+
; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5
39+
; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7
40+
; GCN-NEXT: v_rndne_f32_e32 v9, v7
41+
; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5
42+
; GCN-NEXT: v_sub_f32_e32 v7, v7, v9
43+
; GCN-NEXT: v_add_f32_e32 v7, v7, v8
44+
; GCN-NEXT: v_exp_f32_e32 v7, v7
45+
; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9
46+
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
47+
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
48+
; GCN-NEXT: v_ldexp_f32 v7, v7, v8
49+
; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
50+
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
5351
; GCN-NEXT: s_nop 1
54-
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
55-
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
56-
; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
57-
; GCN-NEXT: v_rndne_f32_e32 v5, v3
58-
; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
59-
; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
60-
; GCN-NEXT: v_add_f32_e32 v3, v3, v4
61-
; GCN-NEXT: v_exp_f32_e32 v3, v3
62-
; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
63-
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
64-
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
65-
; GCN-NEXT: v_ldexp_f32 v3, v3, v4
66-
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
67-
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
52+
; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
53+
; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5
54+
; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7
55+
; GCN-NEXT: v_rndne_f32_e32 v9, v7
56+
; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5
57+
; GCN-NEXT: v_sub_f32_e32 v7, v7, v9
58+
; GCN-NEXT: v_add_f32_e32 v7, v7, v8
59+
; GCN-NEXT: v_exp_f32_e32 v7, v7
60+
; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9
61+
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
62+
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
63+
; GCN-NEXT: v_ldexp_f32 v7, v7, v8
64+
; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
65+
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
6866
; GCN-NEXT: s_nop 1
69-
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
70-
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
71-
; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
72-
; GCN-NEXT: v_rndne_f32_e32 v5, v3
73-
; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
74-
; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
75-
; GCN-NEXT: v_add_f32_e32 v3, v3, v4
76-
; GCN-NEXT: v_exp_f32_e32 v3, v3
77-
; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
78-
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
79-
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
80-
; GCN-NEXT: v_ldexp_f32 v3, v3, v4
81-
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
82-
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
67+
; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
68+
; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5
69+
; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7
70+
; GCN-NEXT: v_rndne_f32_e32 v9, v7
71+
; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5
72+
; GCN-NEXT: v_sub_f32_e32 v7, v7, v9
73+
; GCN-NEXT: v_add_f32_e32 v7, v7, v8
74+
; GCN-NEXT: v_exp_f32_e32 v7, v7
75+
; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9
76+
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
77+
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
78+
; GCN-NEXT: v_ldexp_f32 v7, v7, v8
79+
; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
80+
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
8381
; GCN-NEXT: s_nop 1
84-
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
85-
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
86-
; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
87-
; GCN-NEXT: v_rndne_f32_e32 v5, v3
88-
; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
89-
; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
90-
; GCN-NEXT: v_add_f32_e32 v3, v3, v4
91-
; GCN-NEXT: v_exp_f32_e32 v3, v3
92-
; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
93-
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
94-
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
95-
; GCN-NEXT: v_ldexp_f32 v3, v3, v4
96-
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
97-
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
82+
; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
83+
; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5
84+
; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7
85+
; GCN-NEXT: v_rndne_f32_e32 v9, v7
86+
; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5
87+
; GCN-NEXT: v_sub_f32_e32 v7, v7, v9
88+
; GCN-NEXT: v_add_f32_e32 v7, v7, v8
89+
; GCN-NEXT: v_exp_f32_e32 v7, v7
90+
; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9
91+
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
92+
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
93+
; GCN-NEXT: v_ldexp_f32 v7, v7, v8
94+
; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
95+
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
9896
; GCN-NEXT: s_nop 1
99-
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
100-
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
101-
; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
102-
; GCN-NEXT: v_rndne_f32_e32 v5, v3
103-
; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
104-
; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
105-
; GCN-NEXT: v_add_f32_e32 v3, v3, v4
106-
; GCN-NEXT: v_exp_f32_e32 v3, v3
107-
; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
108-
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
109-
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
110-
; GCN-NEXT: v_ldexp_f32 v3, v3, v4
111-
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
112-
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
97+
; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
98+
; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5
99+
; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7
100+
; GCN-NEXT: v_rndne_f32_e32 v9, v7
101+
; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5
102+
; GCN-NEXT: v_sub_f32_e32 v7, v7, v9
103+
; GCN-NEXT: v_add_f32_e32 v7, v7, v8
104+
; GCN-NEXT: v_exp_f32_e32 v7, v7
105+
; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9
106+
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
107+
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
108+
; GCN-NEXT: v_ldexp_f32 v7, v7, v8
109+
; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
110+
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
113111
; GCN-NEXT: s_nop 1
114-
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
115-
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
116-
; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
117-
; GCN-NEXT: v_rndne_f32_e32 v5, v3
118-
; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
119-
; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
120-
; GCN-NEXT: v_add_f32_e32 v3, v3, v4
121-
; GCN-NEXT: v_exp_f32_e32 v3, v3
122-
; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
123-
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
124-
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
125-
; GCN-NEXT: v_ldexp_f32 v0, v3, v4
112+
; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
113+
; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5
114+
; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7
115+
; GCN-NEXT: v_rndne_f32_e32 v9, v7
116+
; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5
117+
; GCN-NEXT: v_sub_f32_e32 v7, v7, v9
118+
; GCN-NEXT: v_add_f32_e32 v7, v7, v8
119+
; GCN-NEXT: v_exp_f32_e32 v7, v7
120+
; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9
121+
; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
122+
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
123+
; GCN-NEXT: v_ldexp_f32 v4, v7, v8
126124
; GCN-NEXT: s_nop 0
127-
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
128-
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
125+
; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
126+
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
129127
; GCN-NEXT: s_nop 1
130-
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
131-
; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
132-
; GCN-NEXT: v_fma_f32 v3, v0, s0, -v1
133-
; GCN-NEXT: v_rndne_f32_e32 v4, v1
134-
; GCN-NEXT: v_fmac_f32_e32 v3, 0x32a5705f, v0
135-
; GCN-NEXT: v_sub_f32_e32 v1, v1, v4
136-
; GCN-NEXT: v_add_f32_e32 v1, v1, v3
137-
; GCN-NEXT: v_exp_f32_e32 v1, v1
138-
; GCN-NEXT: v_cvt_i32_f32_e32 v3, v4
128+
; GCN-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
129+
; GCN-NEXT: v_mul_f32_e32 v5, 0x3fb8aa3b, v4
130+
; GCN-NEXT: v_fma_f32 v7, v4, s0, -v5
131+
; GCN-NEXT: v_rndne_f32_e32 v8, v5
132+
; GCN-NEXT: v_fmac_f32_e32 v7, 0x32a5705f, v4
133+
; GCN-NEXT: v_sub_f32_e32 v5, v5, v8
134+
; GCN-NEXT: v_add_f32_e32 v5, v5, v7
135+
; GCN-NEXT: v_exp_f32_e32 v5, v5
136+
; GCN-NEXT: v_cvt_i32_f32_e32 v7, v8
139137
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
140-
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v0
141-
; GCN-NEXT: v_mov_b32_e32 v4, 0
142-
; GCN-NEXT: v_ldexp_f32 v1, v1, v3
143-
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
144-
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v0
138+
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4
139+
; GCN-NEXT: v_mov_b32_e32 v8, 0
140+
; GCN-NEXT: v_ldexp_f32 v5, v5, v7
141+
; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
142+
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4
145143
; GCN-NEXT: s_waitcnt lgkmcnt(0)
146-
; GCN-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
147-
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
148-
; GCN-NEXT: global_store_dword v4, v0, s[2:3]
144+
; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
145+
; GCN-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
146+
; GCN-NEXT: global_store_dword v8, v4, s[2:3]
149147
; GCN-NEXT: s_endpgm
150148
%mai0 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in1, i32 0, i32 0, i32 0)
151149
%mai1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai0, i32 0, i32 0, i32 0)

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GCN %s
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -check-prefix=GCN %s
33

44
define amdgpu_kernel void @test_iglp_opt() #0 {
55
; GCN-LABEL: test_iglp_opt:

0 commit comments

Comments
 (0)