Skip to content

Commit ec237da

Browse files
authored
[AMDGPU][True16][CodeGen] insert proper register for 16bit data type in vop3p insts (#153143)
In the true16 flow, we cannot simply replace a v2f16 value with its Lo16 half when Lo == Hi in a VOP3P packed instruction, because the register sizes do not match. Doing so triggers functional errors in the downstream branch: ISel creates an illegal `VGPR_32 = COPY VGPR_16`, which then hits the virtual-register rewriter and the coalescer pass. Correctly insert a reg_sequence/s_mov in the true16 flow instead.
1 parent d57ab27 commit ec237da

File tree

2 files changed

+151
-1
lines changed

2 files changed

+151
-1
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,40 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) {
7676
return false;
7777
}
7878

79+
// Build the operand to use as a VOP3P source when only the 16-bit value Lo is
// needed for both halves.
//
// Without real true16 instructions, Lo is returned unchanged. With them, Lo is
// wrapped in a node that has the value type of Src:
//   - uniform Lo:   an S_MOV_B32 of Lo;
//   - divergent Lo: a VGPR_32 REG_SEQUENCE with Lo in lo16 and an IMPLICIT_DEF
//                   in hi16.
static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
                                        llvm::SelectionDAG *CurDAG,
                                        const GCNSubtarget *Subtarget) {
  if (!Subtarget->useRealTrue16Insts())
    return Lo;

  SDLoc DL(Lo);
  const auto WideVT = Src.getValueType();

  if (!Lo->isDivergent()) {
    // The S_MOV is required because Lo could still be a VGPR16. With the
    // S_MOV in place, isel inserts a "sgpr32 = copy vgpr16" and we rely on
    // the fixvgpr2sgprcopy pass to legalize it.
    auto *Mov = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, WideVT, Lo);
    return SDValue(Mov, 0);
  }

  // Divergent value: place Lo in the low half of a 32-bit VGPR and leave the
  // high half undefined.
  auto *HiUndef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL,
                                         Lo.getValueType());
  const SDValue SeqOps[] = {
      CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, DL, MVT::i32),
      Lo,
      CurDAG->getTargetConstant(AMDGPU::lo16, DL, MVT::i16),
      SDValue(HiUndef, 0),
      CurDAG->getTargetConstant(AMDGPU::hi16, DL, MVT::i16)};

  auto *Seq =
      CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, WideVT, SeqOps);
  return SDValue(Seq, 0);
}
112+
79113
// Look through operations that obscure just looking at the low 16-bits of the
80114
// same register.
81115
static SDValue stripExtractLoElt(SDValue In) {
@@ -3412,8 +3446,10 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
34123446
// Really a scalar input. Just select from the low half of the register to
34133447
// avoid packing.
34143448

3415-
if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
3449+
if (VecSize == Lo.getValueSizeInBits()) {
34163450
Src = Lo;
3451+
} else if (VecSize == 32) {
3452+
Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
34173453
} else {
34183454
assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
34193455

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -stop-after=amdgpu-isel -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11
3+
4+
@const_half = internal constant half 1.0
5+
6+
define amdgpu_kernel void @fma_v2f16_divergent(
7+
; GFX11-LABEL: name: fma_v2f16_divergent
8+
; GFX11: bb.0 (%ir-block.0):
9+
; GFX11-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
10+
; GFX11-NEXT: {{ $}}
11+
; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
12+
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
13+
; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.r.kernarg.offset, align 4, addrspace 4)
14+
; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 68, 0 :: (dereferenceable invariant load (s32) from %ir.d.kernarg.offset, addrspace 4)
15+
; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
16+
; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
17+
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
18+
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
19+
; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
20+
; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
21+
; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
22+
; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
23+
; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
24+
; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
25+
; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
26+
; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
27+
; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1
28+
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
29+
; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
30+
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
31+
; GFX11-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
32+
; GFX11-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE1]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.f.gep, addrspace 1)
33+
; GFX11-NEXT: [[V_AND_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_AND_B16_t16_e64 0, 32767, 0, [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, implicit $exec
34+
; GFX11-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, -32768, 0, [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, implicit $exec
35+
; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
36+
; GFX11-NEXT: S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_2]], implicit-def $scc
37+
; GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
38+
; GFX11-NEXT: [[V_CNDMASK_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CNDMASK_B16_t16_e64 0, killed [[V_XOR_B16_t16_e64_]], 0, killed [[V_AND_B16_t16_e64_]], killed [[COPY10]], 0, implicit $exec
39+
; GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
40+
; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
41+
; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_CNDMASK_B16_t16_e64_]], %subreg.lo16, killed [[DEF]], %subreg.hi16
42+
; GFX11-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE2]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.4, addrspace 1)
43+
; GFX11-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE3]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.5, addrspace 1)
44+
; GFX11-NEXT: [[V_PK_FMA_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_FMA_F16 0, killed [[REG_SEQUENCE4]], 8, killed [[S_LOAD_DWORD_IMM1]], 8, killed [[S_LOAD_DWORD_IMM2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
45+
; GFX11-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[V_PK_FMA_F16_]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
46+
; GFX11-NEXT: S_ENDPGM 0
47+
ptr addrspace(1) %r,
48+
ptr addrspace(1) %fptr,
49+
ptr addrspace(1) %b,
50+
ptr addrspace(1) %c,
51+
i32 %d) {
52+
53+
%idx = call i32 @llvm.amdgcn.workitem.id.x() #1
54+
%f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx
55+
%f = load half, ptr addrspace(1) %f.gep
56+
%f.abs = call half @llvm.fabs.f16(half %f)
57+
%f.neg = fneg half %f
58+
%setcc = icmp ne i32 %d, 0
59+
%select = select i1 %setcc, half %f.abs, half %f.neg
60+
%vec = insertelement <2 x half> poison, half %select, i32 0
61+
%a.val = insertelement <2 x half> %vec, half %select, i32 1
62+
%b.v = load i32, ptr addrspace(1) %b
63+
%b.val = bitcast i32 %b.v to <2 x half>
64+
%c.v = load i32, ptr addrspace(1) %c
65+
%c.val = bitcast i32 %c.v to <2 x half>
66+
%r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
67+
store <2 x half> %r.val, ptr addrspace(1) %r
68+
ret void
69+
}
70+
71+
define amdgpu_kernel void @fma_v2f16_uniform(
72+
; GFX11-LABEL: name: fma_v2f16_uniform
73+
; GFX11: bb.0 (%ir-block.0):
74+
; GFX11-NEXT: liveins: $sgpr4_sgpr5
75+
; GFX11-NEXT: {{ $}}
76+
; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
77+
; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.r.kernarg.offset, align 4, addrspace 4)
78+
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
79+
; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
80+
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
81+
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
82+
; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
83+
; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
84+
; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
85+
; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
86+
; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
87+
; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
88+
; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
89+
; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
90+
; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1
91+
; GFX11-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16) from %ir.3, addrspace 1)
92+
; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]]
93+
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[COPY9]]
94+
; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE2]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.4, addrspace 1)
95+
; GFX11-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE3]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.5, addrspace 1)
96+
; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[S_LOAD_DWORD_IMM1]]
97+
; GFX11-NEXT: [[V_PK_FMA_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_FMA_F16 0, killed [[S_MOV_B32_]], 8, killed [[S_LOAD_DWORD_IMM]], 8, [[COPY10]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
98+
; GFX11-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[V_PK_FMA_F16_]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
99+
; GFX11-NEXT: S_ENDPGM 0
100+
ptr addrspace(1) %r,
101+
ptr addrspace(1) %a,
102+
ptr addrspace(1) %b,
103+
ptr addrspace(1) %c) {
104+
%a.half = load half, ptr addrspace(1) %a
105+
%vec = insertelement <2 x half> poison, half %a.half, i32 0
106+
%a.val = insertelement <2 x half> %vec, half %a.half, i32 1
107+
%b.v = load i32, ptr addrspace(1) %b
108+
%b.val = bitcast i32 %b.v to <2 x half>
109+
%c.v = load i32, ptr addrspace(1) %c
110+
%c.val = bitcast i32 %c.v to <2 x half>
111+
%r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
112+
store <2 x half> %r.val, ptr addrspace(1) %r
113+
ret void
114+
}

0 commit comments

Comments
 (0)