Skip to content

Commit 506a798

Browse files
committed
address comment
1 parent 5d5dbdd commit 506a798

File tree

2 files changed

+135
-16
lines changed

2 files changed

+135
-16
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1899,11 +1899,9 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
18991899
if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
19001900
return false;
19011901

1902-
MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1903-
MachineInstr *OrigDef = Def;
19041902
// Look through COPY. COPY only observed with True16.
1905-
if (Def->isCopy() && Def->getOperand(1).getReg().isVirtual())
1906-
Def = MRI->getVRegDef(Def->getOperand(1).getReg());
1903+
MachineOperand *DefSrc = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
1904+
MachineInstr *Def = MRI->getVRegDef(DefSrc && DefSrc->isReg() ? DefSrc->getReg() : ClampSrc->getReg());
19071905

19081906
// The type of clamp must be compatible.
19091907
if (TII->getClampMask(*Def) != TII->getClampMask(MI))
@@ -1921,7 +1919,7 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
19211919
// Clamp is applied after omod, so it is OK if omod is set.
19221920
DefClamp->setImm(1);
19231921

1924-
Register DefReg = OrigDef->getOperand(0).getReg();
1922+
Register DefReg = Def->getOperand(0).getReg();
19251923
Register MIDstReg = MI.getOperand(0).getReg();
19261924
if (TRI->isSGPRReg(*MRI, DefReg)) {
19271925
// Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*

llvm/test/CodeGen/AMDGPU/true16-fold.mir

Lines changed: 132 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -73,18 +73,139 @@ body: |
7373
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
7474
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
7575
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
76-
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
76+
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
7777
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
78-
; CHECK-NEXT: $vgpr0 = COPY [[COPY4]]
78+
; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY4]], 0, [[COPY4]], -1, 0, 0, implicit $mode, implicit $exec
79+
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
7980
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
80-
%10:vgpr_32 = COPY $vgpr2
81-
%9:vgpr_32 = COPY $vgpr1
82-
%8:vgpr_32 = COPY $vgpr0
83-
%12:sreg_32 = IMPLICIT_DEF
84-
%13:vgpr_32 = COPY %12:sreg_32
85-
%11:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %8:vgpr_32, 8, %9:vgpr_32, 0, %10:vgpr_32, 0, %13:vgpr_32, 0, 0, implicit $mode, implicit $exec
86-
%15:vgpr_16 = COPY %11:vgpr_32
87-
%14:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %15:vgpr_16, 0, %15:vgpr_16, -1, 0, 0, implicit $mode, implicit $exec
88-
$vgpr0 = COPY %14:vgpr_16
81+
%0:vgpr_32 = COPY $vgpr2
82+
%1:vgpr_32 = COPY $vgpr1
83+
%2:vgpr_32 = COPY $vgpr0
84+
%3:sreg_32 = IMPLICIT_DEF
85+
%4:vgpr_32 = COPY %3
86+
%5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
87+
%6:vgpr_16 = COPY %5
88+
%7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec
89+
$vgpr0 = COPY %7
90+
S_ENDPGM 0, implicit $vgpr0
91+
...
92+
93+
---
94+
name: fold_16bit_subreg_folded_clamp
95+
tracksRegLiveness: true
96+
registers:
97+
body: |
98+
bb.0:
99+
liveins: $vgpr0, $vgpr1, $vgpr2
100+
; CHECK-LABEL: name: fold_16bit_madmix_clamp
101+
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
102+
; CHECK-NEXT: {{ $}}
103+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
104+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
105+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
106+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
107+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
108+
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
109+
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
110+
; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY4]], 0, [[COPY4]], -1, 0, 0, implicit $mode, implicit $exec
111+
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
112+
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
113+
%0:vgpr_32 = COPY $vgpr2
114+
%1:vgpr_32 = COPY $vgpr1
115+
%2:vgpr_32 = COPY $vgpr0
116+
%3:sreg_32 = IMPLICIT_DEF
117+
%4:vgpr_32 = COPY %3
118+
%5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
119+
%6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %5.lo16, 0, %5.lo16, -1, 0, 0, implicit $mode, implicit $exec
120+
$vgpr0 = COPY %6
121+
S_ENDPGM 0, implicit $vgpr0
122+
...
123+
124+
---
125+
name: fold_16bit_subreg_clamp
126+
tracksRegLiveness: true
127+
registers:
128+
body: |
129+
bb.0:
130+
liveins: $vgpr0, $vgpr1, $vgpr2
131+
; CHECK-LABEL: name: fold_16bit_subreg_clamp
132+
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
133+
; CHECK-NEXT: {{ $}}
134+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
135+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
136+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
137+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
138+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
139+
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
140+
; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[V_FMA_MIXLO_F16_]].lo16, 0, [[V_FMA_MIXLO_F16_]].lo16, -1, 0, 0, implicit $mode, implicit $exec
141+
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
142+
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
143+
%0:vgpr_32 = COPY $vgpr2
144+
%1:vgpr_32 = COPY $vgpr1
145+
%2:vgpr_32 = COPY $vgpr0
146+
%3:sreg_32 = IMPLICIT_DEF
147+
%4:vgpr_32 = COPY %3
148+
%5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
149+
%6:vgpr_16 = COPY %5.lo16
150+
%7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec
151+
$vgpr0 = COPY %7
152+
S_ENDPGM 0, implicit $vgpr0
153+
...
154+
155+
---
156+
name: fold_16bit_phyreg_clamp
157+
tracksRegLiveness: true
158+
registers:
159+
body: |
160+
bb.0:
161+
liveins: $vgpr0, $vgpr1, $vgpr2
162+
; CHECK-LABEL: name: fold_16bit_phyreg_clamp
163+
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
164+
; CHECK-NEXT: {{ $}}
165+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
166+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
167+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
168+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
169+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
170+
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
171+
; CHECK-NEXT: $vgpr10_lo16 = COPY [[V_FMA_MIXLO_F16_]]
172+
; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec
173+
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
174+
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
175+
%0:vgpr_32 = COPY $vgpr2
176+
%1:vgpr_32 = COPY $vgpr1
177+
%2:vgpr_32 = COPY $vgpr0
178+
%3:sreg_32 = IMPLICIT_DEF
179+
%4:vgpr_32 = COPY %3
180+
%5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
181+
$vgpr10_lo16 = COPY %5
182+
%6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec
183+
$vgpr0 = COPY %6
184+
S_ENDPGM 0, implicit $vgpr0
185+
...
186+
187+
---
188+
name: fold_16bit_undef_clamp
189+
tracksRegLiveness: true
190+
registers:
191+
body: |
192+
bb.0:
193+
liveins: $vgpr0, $vgpr1, $vgpr2
194+
; CHECK-LABEL: name: fold_16bit_undef_clamp
195+
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
196+
; CHECK-NEXT: {{ $}}
197+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
198+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
199+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
200+
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
201+
; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[DEF]], 0, [[DEF]], -1, 0, 0, implicit $mode, implicit $exec
202+
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
203+
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
204+
%0:vgpr_32 = COPY $vgpr2
205+
%1:vgpr_32 = COPY $vgpr1
206+
%2:vgpr_32 = COPY $vgpr0
207+
%3:vgpr_16 = IMPLICIT_DEF
208+
%4:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %3, 0, %3, -1, 0, 0, implicit $mode, implicit $exec
209+
$vgpr0 = COPY %4
89210
S_ENDPGM 0, implicit $vgpr0
90211
...

0 commit comments

Comments
 (0)