Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1899,7 +1899,13 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
return false;

MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
if (!ClampSrc->getReg().isVirtual())
return false;

// Look through COPY. COPY only observed with True16.
Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
MachineInstr *Def =
MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());

// The type of clamp must be compatible.
if (TII->getClampMask(*Def) != TII->getClampMask(MI))
Expand Down
29 changes: 7 additions & 22 deletions llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -269,19 +269,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2
}

define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v0.l, v0.l, v0.l clamp
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; SDAG-GFX1100-FAKE16: ; %bb.0:
; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; GFX900: ; %bb.0:
Expand Down Expand Up @@ -312,12 +304,6 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; GISEL-GFX1100: ; %bb.0:
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
Expand Down Expand Up @@ -1524,10 +1510,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v3.l, v3.l, v3.l clamp
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
Expand Down
149 changes: 149 additions & 0 deletions llvm/test/CodeGen/AMDGPU/true16-fold.mir
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,153 @@ body: |
%4:vgpr_16 = COPY %3:sgpr_lo16
%5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec
S_ENDPGM 0, implicit %5

---
name: fold_16bit_madmix_clamp
tracksRegLiveness: true
registers:
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-LABEL: name: fold_16bit_madmix_clamp
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIXLO_F16_]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
%0:vgpr_32 = COPY $vgpr2
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = COPY $vgpr0
%3:sreg_32 = IMPLICIT_DEF
%4:vgpr_32 = COPY %3
%5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
%6:vgpr_16 = COPY %5
%7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec
$vgpr0 = COPY %7
S_ENDPGM 0, implicit $vgpr0
...

---
name: fold_16bit_subreg_1_clamp
tracksRegLiveness: true
registers:
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-LABEL: name: fold_16bit_subreg_1_clamp
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[V_FMA_MIXLO_F16_]].lo16, 0, [[V_FMA_MIXLO_F16_]].lo16, -1, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
%0:vgpr_32 = COPY $vgpr2
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = COPY $vgpr0
%3:sreg_32 = IMPLICIT_DEF
%4:vgpr_32 = COPY %3
%5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
%6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %5.lo16, 0, %5.lo16, -1, 0, 0, implicit $mode, implicit $exec
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to make this case and the other subreg case fold, as this is a more correct MIR than in fold_16bit_madmix_clamp (COPY of different size src and dst in that one)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can keep it like this as we should not hit this case anyway.

We currently have the problem of different size COPY and thus we actually hit the case in line 5. We will need to support the true16 version of V_FMA_MIXLO_F16 to fix the COPY issue and thus we will have vpgr16 return without subreg.

$vgpr0 = COPY %6
S_ENDPGM 0, implicit $vgpr0
...

---
name: fold_16bit_subreg_2_clamp
tracksRegLiveness: true
registers:
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-LABEL: name: fold_16bit_subreg_2_clamp
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[V_FMA_MIXLO_F16_]].lo16, 0, [[V_FMA_MIXLO_F16_]].lo16, -1, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
%0:vgpr_32 = COPY $vgpr2
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = COPY $vgpr0
%3:sreg_32 = IMPLICIT_DEF
%4:vgpr_32 = COPY %3
%5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
%6:vgpr_16 = COPY %5.lo16
%7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec
$vgpr0 = COPY %7
S_ENDPGM 0, implicit $vgpr0
...

---
name: fold_16bit_phyreg_clamp
tracksRegLiveness: true
registers:
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-LABEL: name: fold_16bit_phyreg_clamp
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: $vgpr10_lo16 = COPY [[V_FMA_MIXLO_F16_]]
; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
%0:vgpr_32 = COPY $vgpr2
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = COPY $vgpr0
%3:sreg_32 = IMPLICIT_DEF
%4:vgpr_32 = COPY %3
%5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
$vgpr10_lo16 = COPY %5
%6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec
$vgpr0 = COPY %6
S_ENDPGM 0, implicit $vgpr0
...

---
name: fold_16bit_undef_clamp
tracksRegLiveness: true
registers:
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-LABEL: name: fold_16bit_undef_clamp
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[DEF]], 0, [[DEF]], -1, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
%0:vgpr_32 = COPY $vgpr2
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = COPY $vgpr0
%3:vgpr_16 = IMPLICIT_DEF
%4:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %3, 0, %3, -1, 0, 0, implicit $mode, implicit $exec
$vgpr0 = COPY %4
S_ENDPGM 0, implicit $vgpr0
...
Loading