From 5d5dbdd9884ee9340865c8c1c5adeaaf1a938d87 Mon Sep 17 00:00:00 2001
From: guochen2
Date: Wed, 26 Feb 2025 12:31:48 -0500
Subject: [PATCH 1/4] true16 for fold clamp

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 6 ++++-
 llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 29 ++++++----------------
 llvm/test/CodeGen/AMDGPU/true16-fold.mir | 30 +++++++++++++++++++++++
 3 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 2bfc37b68a2ec..54a7640346c95 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1900,6 +1900,10 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
     return false;
 
   MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
+  MachineInstr *OrigDef = Def;
+  // Look through COPY. COPY only observed with True16.
+  if (Def->isCopy() && Def->getOperand(1).getReg().isVirtual())
+    Def = MRI->getVRegDef(Def->getOperand(1).getReg());
 
   // The type of clamp must be compatible.
   if (TII->getClampMask(*Def) != TII->getClampMask(MI))
@@ -1917,7 +1921,7 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
 
   // Clamp is applied after omod, so it is OK if omod is set.
   DefClamp->setImm(1);
-  Register DefReg = Def->getOperand(0).getReg();
+  Register DefReg = OrigDef->getOperand(0).getReg();
   Register MIDstReg = MI.getOperand(0).getReg();
   if (TRI->isSGPRReg(*MRI, DefReg)) {
     // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 15cb404a3840a..beac41e42e0c6 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -269,19 +269,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2
 }
 
 define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
-; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; SDAG-GFX1100-TRUE16: ; %bb.0:
-; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v0.l, v0.l, v0.l clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; SDAG-GFX1100-FAKE16: ; %bb.0:
-; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
 ; GFX900: ; %bb.0:
@@ -312,12 +304,6 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
 ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp
 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
 ;
-; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; GISEL-GFX1100: ; %bb.0:
-; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
-;
 ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
 ; GISEL-CI: ; %bb.0:
 ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1524,10 +1510,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
 ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
 ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
 ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v3.l, v3.l, v3.l clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
 ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
index ef6e4007b8f7a..e666237290f57 100644
--- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
@@ -57,4 +57,34 @@ body: |
     %4:vgpr_16 = COPY %3:sgpr_lo16
     %5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec
     S_ENDPGM 0, implicit %5
+
+---
+name: fold_16bit_madmix_clamp
+tracksRegLiveness: true
+registers:
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-LABEL: name: fold_16bit_madmix_clamp
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+    ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]]
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+    %10:vgpr_32 = COPY $vgpr2
+    %9:vgpr_32 = COPY $vgpr1
+    %8:vgpr_32 = COPY $vgpr0
+    %12:sreg_32 = IMPLICIT_DEF
+    %13:vgpr_32 = COPY %12:sreg_32
+    %11:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %8:vgpr_32, 8, %9:vgpr_32, 0, %10:vgpr_32, 0, %13:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %15:vgpr_16 = COPY %11:vgpr_32
+    %14:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %15:vgpr_16, 0, %15:vgpr_16, -1, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %14:vgpr_16
+    S_ENDPGM 0, implicit $vgpr0
 ...
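Series note: PATCH 1 teaches SIFoldOperandsImpl::tryFoldClamp to look through the COPY that True16 inserts between the 32-bit mad-mix result and the 16-bit V_MAX_F16_t16 clamp, so the clamp can be folded into the defining instruction. The remaining patches rework how that copy is looked through. A condensed sketch of the shape the logic converges on by PATCH 4 (surrounding checks abridged; the explanatory comments here are added for illustration, not part of the patch):

    // Physical registers are not tracked by getVRegDef, so bail out early.
    if (!ClampSrc->getReg().isVirtual())
      return false;

    // Look through COPY. COPY only observed with True16.
    Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
    MachineInstr *Def =
        MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());

    // The type of clamp must be compatible.
    if (TII->getClampMask(*Def) != TII->getClampMask(MI))
      return false;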
From 506a7989da2d78ee8bdad1164172e3505bafc20b Mon Sep 17 00:00:00 2001
From: guochen2
Date: Sat, 8 Mar 2025 19:40:57 -0500
Subject: [PATCH 2/4] address comment

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 8 +-
 llvm/test/CodeGen/AMDGPU/true16-fold.mir | 143 ++++++++++++++++++++--
 2 files changed, 135 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 54a7640346c95..9a4267a099650 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1899,11 +1899,9 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
   if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
     return false;
 
-  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
-  MachineInstr *OrigDef = Def;
   // Look through COPY. COPY only observed with True16.
-  if (Def->isCopy() && Def->getOperand(1).getReg().isVirtual())
-    Def = MRI->getVRegDef(Def->getOperand(1).getReg());
+  MachineOperand *DefSrc = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
+  MachineInstr *Def = MRI->getVRegDef(DefSrc && DefSrc->isReg() ? DefSrc->getReg() : ClampSrc->getReg());
 
   // The type of clamp must be compatible.
   if (TII->getClampMask(*Def) != TII->getClampMask(MI))
@@ -1921,7 +1919,7 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
 
   // Clamp is applied after omod, so it is OK if omod is set.
   DefClamp->setImm(1);
-  Register DefReg = OrigDef->getOperand(0).getReg();
+  Register DefReg = Def->getOperand(0).getReg();
   Register MIDstReg = MI.getOperand(0).getReg();
   if (TRI->isSGPRReg(*MRI, DefReg)) {
     // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
index e666237290f57..9ce612677334a 100644
--- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
@@ -73,18 +73,139 @@ body: |
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
-    ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
-    ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]]
+    ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY4]], 0, [[COPY4]], -1, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
-    %10:vgpr_32 = COPY $vgpr2
-    %9:vgpr_32 = COPY $vgpr1
-    %8:vgpr_32 = COPY $vgpr0
-    %12:sreg_32 = IMPLICIT_DEF
-    %13:vgpr_32 = COPY %12:sreg_32
-    %11:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %8:vgpr_32, 8, %9:vgpr_32, 0, %10:vgpr_32, 0, %13:vgpr_32, 0, 0, implicit $mode, implicit $exec
-    %15:vgpr_16 = COPY %11:vgpr_32
-    %14:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %15:vgpr_16, 0, %15:vgpr_16, -1, 0, 0, implicit $mode, implicit $exec
-    $vgpr0 = COPY %14:vgpr_16
+    %0:vgpr_32 = COPY $vgpr2
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr0
+    %3:sreg_32 = IMPLICIT_DEF
+    %4:vgpr_32 = COPY %3
+    %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
+    %6:vgpr_16 = COPY %5
+    %7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %7
+    S_ENDPGM 0, implicit $vgpr0
+...
+
+---
+name: fold_16bit_subreg_folded_clamp
+tracksRegLiveness: true
+registers:
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-LABEL: name: fold_16bit_madmix_clamp
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+    ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
+    ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY4]], 0, [[COPY4]], -1, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = COPY $vgpr2
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr0
+    %3:sreg_32 = IMPLICIT_DEF
+    %4:vgpr_32 = COPY %3
+    %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
+    %6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %5.lo16, 0, %5.lo16, -1, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %6
+    S_ENDPGM 0, implicit $vgpr0
+...
+
+---
+name: fold_16bit_subreg_clamp
+tracksRegLiveness: true
+registers:
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-LABEL: name: fold_16bit_subreg_clamp
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+    ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[V_FMA_MIXLO_F16_]].lo16, 0, [[V_FMA_MIXLO_F16_]].lo16, -1, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = COPY $vgpr2
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr0
+    %3:sreg_32 = IMPLICIT_DEF
+    %4:vgpr_32 = COPY %3
+    %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
+    %6:vgpr_16 = COPY %5.lo16
+    %7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %7
+    S_ENDPGM 0, implicit $vgpr0
+...
+
+---
+name: fold_16bit_phyreg_clamp
+tracksRegLiveness: true
+registers:
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-LABEL: name: fold_16bit_phyreg_clamp
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+    ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: $vgpr10_lo16 = COPY [[V_FMA_MIXLO_F16_]]
+    ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = COPY $vgpr2
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr0
+    %3:sreg_32 = IMPLICIT_DEF
+    %4:vgpr_32 = COPY %3
+    %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
+    $vgpr10_lo16 = COPY %5
+    %6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %6
+    S_ENDPGM 0, implicit $vgpr0
+...
+
+---
+name: fold_16bit_undef_clamp
+tracksRegLiveness: true
+registers:
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-LABEL: name: fold_16bit_undef_clamp
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[DEF]], 0, [[DEF]], -1, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = COPY $vgpr2
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr0
+    %3:vgpr_16 = IMPLICIT_DEF
+    %4:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %3, 0, %3, -1, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %4
     S_ENDPGM 0, implicit $vgpr0
 ...

From 50498a6e0b469ec320bb4fbbbdf9bea33af86739 Mon Sep 17 00:00:00 2001
From: guochen2
Date: Thu, 13 Mar 2025 17:22:43 -0400
Subject: [PATCH 3/4] address comment

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 5 +++--
 llvm/test/CodeGen/AMDGPU/true16-fold.mir | 21 +++++++++------------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 9a4267a099650..dd0b5b7a2db34 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1900,8 +1900,9 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
     return false;
 
   // Look through COPY. COPY only observed with True16.
-  MachineOperand *DefSrc = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
-  MachineInstr *Def = MRI->getVRegDef(DefSrc && DefSrc->isReg() ? DefSrc->getReg() : ClampSrc->getReg());
+  MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, ClampSrc->getReg());
+  MachineInstr *Def = MRI->getVRegDef(
+      DefSrc && DefSrc->isReg() ? DefSrc->getReg() : ClampSrc->getReg());
 
   // The type of clamp must be compatible.
   if (TII->getClampMask(*Def) != TII->getClampMask(MI))
diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
index 9ce612677334a..653fe48d67bad 100644
--- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
@@ -73,10 +73,9 @@ body: |
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
-    ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
-    ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY4]], 0, [[COPY4]], -1, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIXLO_F16_]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
     %0:vgpr_32 = COPY $vgpr2
     %1:vgpr_32 = COPY $vgpr1
@@ -91,13 +90,13 @@ body: |
 ...
 
 ---
-name: fold_16bit_subreg_folded_clamp
+name: fold_16bit_subreg_1_clamp
 tracksRegLiveness: true
 registers:
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2
-    ; CHECK-LABEL: name: fold_16bit_madmix_clamp
+    ; CHECK-LABEL: name: fold_16bit_subreg_1_clamp
     ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
     ; CHECK-NEXT: {{ $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
@@ -106,8 +105,7 @@ body: |
     ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
     ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
-    ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY4]], 0, [[COPY4]], -1, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[V_FMA_MIXLO_F16_]].lo16, 0, [[V_FMA_MIXLO_F16_]].lo16, -1, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
     %0:vgpr_32 = COPY $vgpr2
@@ -122,13 +120,13 @@ body: |
 ...
 
 ---
-name: fold_16bit_subreg_clamp
+name: fold_16bit_subreg_2_clamp
 tracksRegLiveness: true
 registers:
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2
-    ; CHECK-LABEL: name: fold_16bit_subreg_clamp
+    ; CHECK-LABEL: name: fold_16bit_subreg_2_clamp
     ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
     ; CHECK-NEXT: {{ $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
@@ -167,10 +165,9 @@ body: |
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
-    ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: $vgpr10_lo16 = COPY [[V_FMA_MIXLO_F16_]]
-    ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIXLO_F16_]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
     %0:vgpr_32 = COPY $vgpr2
     %1:vgpr_32 = COPY $vgpr1

From a7d130c78fd84ecbb4e5371150ab4f062070b30e Mon Sep 17 00:00:00 2001
From: guochen2
Date: Wed, 26 Mar 2025 16:43:20 -0400
Subject: [PATCH 4/4] use lookthrucopylike

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 9 ++++++---
 llvm/test/CodeGen/AMDGPU/true16-fold.mir | 5 +++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index dd0b5b7a2db34..d6acf9e081b9f 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1899,10 +1899,13 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
   if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
     return false;
 
+  if (!ClampSrc->getReg().isVirtual())
+    return false;
+
   // Look through COPY. COPY only observed with True16.
-  MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, ClampSrc->getReg());
-  MachineInstr *Def = MRI->getVRegDef(
-      DefSrc && DefSrc->isReg() ? DefSrc->getReg() : ClampSrc->getReg());
+  Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
+  MachineInstr *Def =
+      MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
 
   // The type of clamp must be compatible.
   if (TII->getClampMask(*Def) != TII->getClampMask(MI))
diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
index 653fe48d67bad..93cc12f152cca 100644
--- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
@@ -165,9 +165,10 @@ body: |
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
-    ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: $vgpr10_lo16 = COPY [[V_FMA_MIXLO_F16_]]
-    ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIXLO_F16_]]
+    ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
     %0:vgpr_32 = COPY $vgpr2
     %1:vgpr_32 = COPY $vgpr1