-
Notifications
You must be signed in to change notification settings - Fork 15.3k
release/20.x: AMDGPU: Widen f16 minimum/maximum to v2f16 on gfx950 (#128121) #128132
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@shiltian What do you think about merging this PR to the release branch? |
|
@llvm/pr-subscribers-backend-amdgpu Author: None (llvmbot) Changes: Backport e729dc7. Requested by: @arsenm. Patch is 81.33 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/128132.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b632c50dae0e3..a969d38e1440a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -869,8 +869,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasMinimum3Maximum3F32())
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
- if (Subtarget->hasMinimum3Maximum3PKF16())
+ if (Subtarget->hasMinimum3Maximum3PKF16()) {
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
+
+ // If only the vector form is available, we need to widen to a vector.
+ if (!Subtarget->hasMinimum3Maximum3F16())
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
+ }
}
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
@@ -5963,6 +5968,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMINNUM:
case ISD::FMAXNUM:
return lowerFMINNUM_FMAXNUM(Op, DAG);
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
+ return lowerFMINIMUM_FMAXIMUM(Op, DAG);
case ISD::FLDEXP:
case ISD::STRICT_FLDEXP:
return lowerFLDEXP(Op, DAG);
@@ -5984,8 +5992,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMUL:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
- case ISD::FMINIMUM:
- case ISD::FMAXIMUM:
case ISD::FMINIMUMNUM:
case ISD::FMAXIMUMNUM:
case ISD::UADDSAT:
@@ -6840,6 +6846,34 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
return Op;
}
+SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ if (VT.isVector())
+ return splitBinaryVectorOp(Op, DAG);
+
+ assert(!Subtarget->hasIEEEMinMax() && !Subtarget->hasMinimum3Maximum3F16() &&
+ Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
+ "should not need to widen f16 minimum/maximum to v2f16");
+
+ // Widen f16 operation to v2f16
+
+ // fminimum f16:x, f16:y ->
+ // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
+ // (v2f16 (scalar_to_vector y))), 0
+ SDLoc SL(Op);
+ SDValue WideSrc0 =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
+ SDValue WideSrc1 =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
+
+ SDValue Widened =
+ DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
+ DAG.getConstant(0, SL, MVT::i32));
+}
+
SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1cd7f1b29e077..9b2c14862407a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -146,6 +146,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index f0fa621e3b4bc..6724c37605eb4 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1251,19 +1251,27 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
ret half %max1
@@ -1280,19 +1288,27 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v2, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_commute:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v2, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_commute:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v2, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_commute:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %c, half %max0)
ret half %max1
@@ -1309,22 +1325,34 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %
; GFX12-NEXT: v_readfirstlane_b32 s0, v0
; GFX12-NEXT: ; return to shader part epilog
;
-; GFX9-LABEL: s_fmaximum3_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_max_f16_e32 v1, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, s2, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
+; GFX942-LABEL: s_fmaximum3_f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NEXT: v_max_f16_e32 v1, s0, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, s2, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_readfirstlane_b32 s0, v0
+; GFX942-NEXT: ; return to shader part epilog
+;
+; GFX950-LABEL: s_fmaximum3_f16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s2, s2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: ; return to shader part epilog
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
%cast = bitcast half %max1 to i16
@@ -1344,19 +1372,28 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, |v0|, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fabs0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, |v0|, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs0:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, |v0|, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs0:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%max0 = call half @llvm.maximum.f16(half %a.fabs, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1374,19 +1411,28 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, |v1|, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fabs1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, v0, |v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs1:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, v0, |v1|
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs1:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%b.fabs = call half @llvm.fabs.f16(half %b)
%max0 = call half @llvm.maximum.f16(half %a, half %b.fabs)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1404,19 +1450,28 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, |v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fabs2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2|
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs2:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e64 v1, v0, |v2|
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs2:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%c.fabs = call half @llvm.fabs.f16(half %c)
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs)
@@ -1434,19 +1489,30 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, |v0|, |v1|, |v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fabs_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, |v0|, |v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2|
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, |v0|, |v1|
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e64 v1, v0, |v2|
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%b.fabs = call half @llvm.fabs.f16(half %b)
%c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1466,19 +1532,30 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, -v0, -v1, -v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fneg_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, -v0, -v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, -v0, -v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e64 v1, v0, -v2
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg half %a
%b.fneg = fneg half %b
%c.fneg = fneg half %c
@@ -1498,19 +1575,30 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, -|v0|, -|v1|, -|v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fneg_fabs_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, -|v0|, -|v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e64 v1, v0, -|v2|
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, -|v0|, -|v1|
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e64 v1, v0, -|v2|
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT: v_or_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%b.fabs = call half @llvm.fabs.f16(half %b)
%c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1533,19 +1621,28 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, -v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fneg0:...
[truncated]
|
|
I also just noticed the AMDGPU release notes got lost; this was a missing piece of the gfx950 line item. |
Unfortunately we only have the vector (v2f16) versions of minimum3 and maximum3. Widen f16 to v2f16 so we can lower the operation as minimum3(x, y, y). (cherry picked from commit e729dc7)
|
@arsenm (or anyone else): if you would like to add a note about this fix to the release notes (completely optional), please reply to this comment with a one- or two-sentence description of the fix. When you are done, please add the release:note label to this PR. |
Backport e729dc7
Requested by: @arsenm