-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[AMDGPU][GlobalISel] Lower G_FMINIMUM and G_FMAXIMUM #151122
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Mirko Brkušanin (mbrkusanin) ChangesAdd GlobalISel lowering of G_FMINIMUM and G_FMAXIMUM following the same Patch is 128.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151122.diff 7 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index ea0873f41ebba..e100a2c69ffbf 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -456,6 +456,7 @@ class LegalizerHelper {
LLVM_ABI LegalizeResult lowerMinMax(MachineInstr &MI);
LLVM_ABI LegalizeResult lowerFCopySign(MachineInstr &MI);
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI);
+ LLVM_ABI LegalizeResult lowerFMinimumMaximum(MachineInstr &MI);
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI);
LLVM_ABI LegalizeResult lowerIntrinsicRound(MachineInstr &MI);
LLVM_ABI LegalizeResult lowerFFloor(MachineInstr &MI);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 538a763f9f48e..3e6df88fd1c77 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4675,6 +4675,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
case G_FMINIMUMNUM:
case G_FMAXIMUMNUM:
return lowerFMinNumMaxNum(MI);
+ case G_FMINIMUM:
+ case G_FMAXIMUM:
+ return lowerFMinimumMaximum(MI);
case G_MERGE_VALUES:
return lowerMergeValues(MI);
case G_UNMERGE_VALUES:
@@ -8294,6 +8297,75 @@ LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
return Legalized;
}
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFMinimumMaximum(MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ auto [Dst, Src0, Src1] = MI.getFirst3Regs();
+ LLT Ty = MRI.getType(Dst);
+ LLT CmpTy =
+ Ty.isScalar() ? LLT::scalar(1) : LLT::vector(Ty.getElementCount(), 1);
+
+ bool IsMax = (Opc == TargetOpcode::G_FMAXIMUM);
+ unsigned OpcIeee =
+ IsMax ? TargetOpcode::G_FMAXNUM_IEEE : TargetOpcode::G_FMINNUM_IEEE;
+ unsigned OpcNonIeee =
+ IsMax ? TargetOpcode::G_FMAXNUM : TargetOpcode::G_FMINNUM;
+ bool MinMaxMustRespectOrderedZero = false;
+ Register Res;
+
+ // IEEE variants don't need canonicalization
+ if (LI.isLegalOrCustom({OpcIeee, Ty})) {
+ Res = MIRBuilder.buildInstr(OpcIeee, {Ty}, {Src0, Src1}).getReg(0);
+ MinMaxMustRespectOrderedZero = true;
+ } else if (LI.isLegalOrCustom({OpcNonIeee, Ty})) {
+ Res = MIRBuilder.buildInstr(OpcNonIeee, {Ty}, {Src0, Src1}).getReg(0);
+ } else {
+ auto Compare = MIRBuilder.buildFCmp(
+ IsMax ? CmpInst::FCMP_OGT : CmpInst::FCMP_OLT, CmpTy, Src0, Src1);
+ Res = MIRBuilder.buildSelect(Ty, Compare, Src0, Src1).getReg(0);
+ }
+
+ // Propagate any NaN of both operands
+ if (!MI.getFlag(MachineInstr::FmNoNans) &&
+ (!isKnownNeverNaN(Src0, MRI) || isKnownNeverNaN(Src1, MRI))) {
+ auto IsOrdered = MIRBuilder.buildFCmp(CmpInst::FCMP_ORD, CmpTy, Src0, Src1);
+
+ LLT ElementTy = Ty.isScalar() ? Ty : Ty.getElementType();
+ APFloat NaNValue = APFloat::getNaN(getFltSemanticForLLT(ElementTy));
+ Register NaN = MIRBuilder.buildFConstant(ElementTy, NaNValue).getReg(0);
+ if (Ty.isVector())
+ NaN = MIRBuilder.buildSplatBuildVector(Ty, NaN).getReg(0);
+
+ Res = MIRBuilder.buildSelect(Ty, IsOrdered, Res, NaN).getReg(0);
+ }
+
+ // fminimum/fmaximum requires -0.0 less than +0.0
+ if (!MinMaxMustRespectOrderedZero && !MI.getFlag(MachineInstr::FmNsz)) {
+ GISelValueTracking VT(MIRBuilder.getMF());
+ KnownFPClass Src0Info = VT.computeKnownFPClass(Src0, fcZero);
+ KnownFPClass Src1Info = VT.computeKnownFPClass(Src1, fcZero);
+
+ if (!Src0Info.isKnownNeverZero() && !Src1Info.isKnownNeverZero()) {
+ Register Zero = MIRBuilder.buildFConstant(Ty, 0.0).getReg(0);
+ auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_OEQ, CmpTy, Res, Zero);
+
+ unsigned TestClass = IsMax ? fcPosZero : fcNegZero;
+
+ auto LHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src0, TestClass);
+ auto LHSSelect = MIRBuilder.buildSelect(Ty, LHSTestZero, Src0, Res);
+
+ auto RHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src1, TestClass);
+ auto RHSSelect = MIRBuilder.buildSelect(Ty, RHSTestZero, Src1, LHSSelect);
+
+ Res = MIRBuilder.buildSelect(Ty, IsZero, RHSSelect, Res).getReg(0);
+ }
+ }
+
+ MIRBuilder.buildCopy(Dst, Res);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
// Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
Register DstReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index fedfa3f9dd900..2dfa07ca9cfe2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -957,9 +957,27 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
}
+ auto &MinNumMaxNumIeee = getActionDefinitionsBuilder(
+ {G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
+
+ if (ST.hasVOP3PInsts()) {
+ MinNumMaxNumIeee.legalFor(FPTypesPK16)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .clampMaxNumElements(0, S16, 2)
+ .clampScalar(0, S16, S64)
+ .scalarize(0);
+ } else if (ST.has16BitInsts()) {
+ MinNumMaxNumIeee.legalFor(FPTypes16)
+ .clampScalar(0, S16, S64)
+ .scalarize(0);
+ } else {
+ MinNumMaxNumIeee.legalFor(FPTypesBase)
+ .clampScalar(0, S32, S64)
+ .scalarize(0);
+ }
+
auto &MinNumMaxNum = getActionDefinitionsBuilder(
- {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
- G_FMAXNUM_IEEE});
+ {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
if (ST.hasVOP3PInsts()) {
MinNumMaxNum.customFor(FPTypesPK16)
@@ -2100,9 +2118,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalFor(FPTypesPK16)
.clampMaxNumElements(0, S16, 2)
.scalarize(0);
+ } else if (ST.hasVOP3PInsts()){
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+ .lowerFor({V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .lower();
} else {
- // TODO: Implement
- getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+ .scalarize(0)
+ .clampScalar(0, S32, S64)
+ .lower();
}
getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
@@ -2159,8 +2185,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINIMUMNUM:
case TargetOpcode::G_FMAXIMUMNUM:
- case TargetOpcode::G_FMINNUM_IEEE:
- case TargetOpcode::G_FMAXNUM_IEEE:
return legalizeMinNumMaxNum(Helper, MI);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return legalizeExtractVectorElt(MI, MRI, B);
@@ -2734,23 +2758,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
MachineFunction &MF = Helper.MIRBuilder.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
- MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
-
- // With ieee_mode disabled, the instructions have the correct behavior
- // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
- //
- // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
- // enabled.
- if (!MFI->getMode().IEEE) {
- if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
- MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
- return true;
-
- return !IsIEEEOp;
- }
-
- if (IsIEEEOp)
+ // With ieee_mode disabled, the instructions have the correct behavior.
+ if (!MFI->getMode().IEEE)
return true;
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir
new file mode 100644
index 0000000000000..4b214e66ea994
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir
@@ -0,0 +1,275 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX12 %s
+
+---
+name: test_fmaximum_f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[TRUNC]], [[TRUNC1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[SELECT]](s16)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY2]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s16) = G_FMAXIMUM [[TRUNC]], [[TRUNC1]]
+ ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMAXIMUM]](s16)
+ ; GFX12-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s32) = COPY $vgpr1
+ %3:_(s16) = G_TRUNC %2(s32)
+ %4:_(s16) = G_FMAXIMUM %1, %3
+ %5:_(s32) = G_ANYEXT %4(s16)
+ $vgpr0 = COPY %5(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_f64
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; GFX9-LABEL: name: test_fmaximum_f64
+ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s64) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s64), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SELECT]](s64)
+ ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_f64
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s64) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FMAXIMUM]](s64)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s64) = G_FMAXIMUM %0, %1
+ $vgpr0_vgpr1 = COPY %2(s64)
+ SI_RETURN implicit $vgpr0_vgpr1
+...
+---
+name: test_fmaximum_v2f16
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_v2f16
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC2]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC1]](s16), [[TRUNC3]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+ ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM_IEEE]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[TRUNC4]], [[C1]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[TRUNC5]], [[C1]]
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT]](s16), [[SELECT1]](s16)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](<2 x s16>)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_v2f16
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<2 x s16>) = G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](<2 x s16>)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(<2 x s16>) = COPY $vgpr0
+ %1:_(<2 x s16>) = COPY $vgpr1
+ %2:_(<2 x s16>) = G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(<2 x s16>)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_v2f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_fmaximum_v2f32
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY2]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY2]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY1]], [[COPY3]]
+ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY1]](s32), [[COPY3]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[FMAXNUM_IEEE1]], [[C]]
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_v2f32
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[COPY2]]
+ ; GFX12-NEXT: [[FMAXIMUM1:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY1]], [[COPY3]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: $vgpr1 = COPY [[FMAXIMUM1]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(<2 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(<2 x s32>) = G_BUILD_VECTOR %3(s32), %4(s32)
+ %6:_(<2 x s32>) = G_FMAXIMUM %2, %5
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(<2 x s32>)
+ $vgpr0 = COPY %7(s32)
+ $vgpr1 = COPY %8(s32)
+ SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+---
+name: test_fmaximum_nsz_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_nsz_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_nsz_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = nsz G_FMAXIMUM [[COPY]], [[COPY1]]
+ ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+ ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = nsz G_FMAXIMUM %0, %1
+ $vgpr0 = COPY %2(s32)
+ SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_nnan_f32
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: test_fmaximum_nnan_f32
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[FMAXNUM_IEEE]](s32)
+ ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: test_fmaximum_nnan_f32
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX1...
[truncated]
|
|
This also fixes compilation failures for VulkanCTS tests for gfx11 globalisel. |
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
6b98128 to
2d11a0f
Compare
arsenm
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, but can you fix flag propagation in a follow up
| KnownFPClass Src0Info = VT.computeKnownFPClass(Src0, fcZero); | ||
| KnownFPClass Src1Info = VT.computeKnownFPClass(Src1, fcZero); | ||
|
|
||
| if (!Src0Info.isKnownNeverZero() && !Src1Info.isKnownNeverZero()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think there is a bug in this sequence (existing in the DAG one) but I don't remember what the status of that is
Add GlobalISel lowering of G_FMINIMUM and G_FMAXIMUM following the same logic as in SDag's expandFMINIMUM_FMAXIMUM. Update AMDGPU legalization rules: Pre GFX12 now uses new lowering method and make G_FMINNUM_IEEE and G_FMAXNUM_IEEE legal to match SDag.
2d11a0f to
796c135
Compare
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/154/builds/23141 Here is the relevant piece of the build log for the reference |
Add GlobalISel lowering of G_FMINIMUM and G_FMAXIMUM following the same logic as in SDag's expandFMINIMUM_FMAXIMUM. Update AMDGPU legalization rules: Pre GFX12 now uses new lowering method and make G_FMINNUM_IEEE and G_FMAXNUM_IEEE legal to match SDag.
Add GlobalISel lowering of G_FMINIMUM and G_FMAXIMUM following the same logic as in SDag's expandFMINIMUM_FMAXIMUM. Update AMDGPU legalization rules: Pre GFX12 now uses new lowering method and make G_FMINNUM_IEEE and G_FMAXNUM_IEEE legal to match SDag.
Add GlobalISel lowering of G_FMINIMUM and G_FMAXIMUM following the same logic as in SDag's expandFMINIMUM_FMAXIMUM. Update AMDGPU legalization rules: Pre GFX12 now uses new lowering method and make G_FMINNUM_IEEE and G_FMAXNUM_IEEE legal to match SDag.
Add GlobalISel lowering of G_FMINIMUM and G_FMAXIMUM following the same
logic as in SDag's expandFMINIMUM_FMAXIMUM.
Update AMDGPU legalization rules: Pre GFX12 now uses new lowering method
and make G_FMINNUM_IEEE and G_FMAXNUM_IEEE legal to match SDag.