Skip to content

Conversation

@mbrkusanin
Copy link
Collaborator

Add GlobalISel lowering of G_FMINIMUM and G_FMAXIMUM following the same
logic as in SDag's expandFMINIMUM_FMAXIMUM.
Update AMDGPU legalization rules: Pre GFX12 now uses new lowering method
and make G_FMINNUM_IEEE and G_FMAXNUM_IEEE legal to match SDag.

@llvmbot
Copy link
Member

llvmbot commented Jul 29, 2025

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-llvm-globalisel

Author: Mirko Brkušanin (mbrkusanin)

Changes

Add GlobalISel lowering of G_FMINIMUM and G_FMAXIMUM following the same
logic as in SDag's expandFMINIMUM_FMAXIMUM.
Update AMDGPU legalization rules: Pre GFX12 now uses new lowering method
and make G_FMINNUM_IEEE and G_FMAXNUM_IEEE legal to match SDag.


Patch is 128.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151122.diff

7 Files Affected:

  • (modified) llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h (+1)
  • (modified) llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (+72)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+32-23)
  • (added) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir (+275)
  • (added) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminimum.mir (+275)
  • (modified) llvm/test/CodeGen/AMDGPU/fmaximum.ll (+790-131)
  • (modified) llvm/test/CodeGen/AMDGPU/fminimum.ll (+790-131)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index ea0873f41ebba..e100a2c69ffbf 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -456,6 +456,7 @@ class LegalizerHelper {
   LLVM_ABI LegalizeResult lowerMinMax(MachineInstr &MI);
   LLVM_ABI LegalizeResult lowerFCopySign(MachineInstr &MI);
   LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI);
+  LLVM_ABI LegalizeResult lowerFMinimumMaximum(MachineInstr &MI);
   LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI);
   LLVM_ABI LegalizeResult lowerIntrinsicRound(MachineInstr &MI);
   LLVM_ABI LegalizeResult lowerFFloor(MachineInstr &MI);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 538a763f9f48e..3e6df88fd1c77 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4675,6 +4675,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
   case G_FMINIMUMNUM:
   case G_FMAXIMUMNUM:
     return lowerFMinNumMaxNum(MI);
+  case G_FMINIMUM:
+  case G_FMAXIMUM:
+    return lowerFMinimumMaximum(MI);
   case G_MERGE_VALUES:
     return lowerMergeValues(MI);
   case G_UNMERGE_VALUES:
@@ -8294,6 +8297,75 @@ LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
   return Legalized;
 }
 
+// Lower G_FMINIMUM/G_FMAXIMUM (NaN-propagating, -0.0 < +0.0) in three steps:
+// a base min/max, a NaN fixup select, and a signed-zero fixup select.
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFMinimumMaximum(MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  auto [Dst, Src0, Src1] = MI.getFirst3Regs();
+  LLT Ty = MRI.getType(Dst);
+  LLT CmpTy =
+      Ty.isScalar() ? LLT::scalar(1) : LLT::vector(Ty.getElementCount(), 1);
+
+  bool IsMax = (Opc == TargetOpcode::G_FMAXIMUM);
+  unsigned OpcIeee =
+      IsMax ? TargetOpcode::G_FMAXNUM_IEEE : TargetOpcode::G_FMINNUM_IEEE;
+  unsigned OpcNonIeee =
+      IsMax ? TargetOpcode::G_FMAXNUM : TargetOpcode::G_FMINNUM;
+  bool MinMaxMustRespectOrderedZero = false;
+  Register Res;
+
+  // Try the IEEE variant first (no canonicalization needed), then fall back.
+  if (LI.isLegalOrCustom({OpcIeee, Ty})) {
+    Res = MIRBuilder.buildInstr(OpcIeee, {Ty}, {Src0, Src1}).getReg(0);
+    MinMaxMustRespectOrderedZero = true;
+  } else if (LI.isLegalOrCustom({OpcNonIeee, Ty})) {
+    Res = MIRBuilder.buildInstr(OpcNonIeee, {Ty}, {Src0, Src1}).getReg(0);
+  } else {
+    auto Compare = MIRBuilder.buildFCmp(
+        IsMax ? CmpInst::FCMP_OGT : CmpInst::FCMP_OLT, CmpTy, Src0, Src1);
+    Res = MIRBuilder.buildSelect(Ty, Compare, Src0, Src1).getReg(0);
+  }
+
+  // Propagate NaN from either operand unless both are known to never be NaN.
+  if (!MI.getFlag(MachineInstr::FmNoNans) &&
+      (!isKnownNeverNaN(Src0, MRI) || !isKnownNeverNaN(Src1, MRI))) {
+    auto IsOrdered = MIRBuilder.buildFCmp(CmpInst::FCMP_ORD, CmpTy, Src0, Src1);
+
+    LLT ElementTy = Ty.isScalar() ? Ty : Ty.getElementType();
+    APFloat NaNValue = APFloat::getNaN(getFltSemanticForLLT(ElementTy));
+    Register NaN = MIRBuilder.buildFConstant(ElementTy, NaNValue).getReg(0);
+    if (Ty.isVector())
+      NaN = MIRBuilder.buildSplatBuildVector(Ty, NaN).getReg(0);
+    // The ordered compare fails if either input is NaN; select NaN then.
+    Res = MIRBuilder.buildSelect(Ty, IsOrdered, Res, NaN).getReg(0);
+  }
+
+  // fminimum/fmaximum requires -0.0 less than +0.0; fix up if not guaranteed.
+  if (!MinMaxMustRespectOrderedZero && !MI.getFlag(MachineInstr::FmNsz)) {
+    GISelValueTracking VT(MIRBuilder.getMF());
+    KnownFPClass Src0Info = VT.computeKnownFPClass(Src0, fcZero);
+    KnownFPClass Src1Info = VT.computeKnownFPClass(Src1, fcZero);
+
+    if (!Src0Info.isKnownNeverZero() && !Src1Info.isKnownNeverZero()) {
+      Register Zero = MIRBuilder.buildFConstant(Ty, 0.0).getReg(0);
+      auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_OEQ, CmpTy, Res, Zero);
+      // For a zero result, prefer an operand that is +0.0 (max) or -0.0 (min).
+      unsigned TestClass = IsMax ? fcPosZero : fcNegZero;
+      auto LHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src0, TestClass);
+      auto LHSSelect = MIRBuilder.buildSelect(Ty, LHSTestZero, Src0, Res);
+      auto RHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src1, TestClass);
+      auto RHSSelect = MIRBuilder.buildSelect(Ty, RHSTestZero, Src1, LHSSelect);
+
+      Res = MIRBuilder.buildSelect(Ty, IsZero, RHSSelect, Res).getReg(0);
+    }
+  }
+
+  MIRBuilder.buildCopy(Dst, Res);
+  MI.eraseFromParent();
+  return Legalized;
+}
+
 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
   Register DstReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index fedfa3f9dd900..2dfa07ca9cfe2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -957,9 +957,27 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
   }
 
+  auto &MinNumMaxNumIeee = getActionDefinitionsBuilder(
+      {G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
+
+  if (ST.hasVOP3PInsts()) {
+    MinNumMaxNumIeee.legalFor(FPTypesPK16)
+      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+      .clampMaxNumElements(0, S16, 2)
+      .clampScalar(0, S16, S64)
+      .scalarize(0);
+  } else if (ST.has16BitInsts()) {
+    MinNumMaxNumIeee.legalFor(FPTypes16)
+      .clampScalar(0, S16, S64)
+      .scalarize(0);
+  } else {
+    MinNumMaxNumIeee.legalFor(FPTypesBase)
+      .clampScalar(0, S32, S64)
+      .scalarize(0);
+  }
+
   auto &MinNumMaxNum = getActionDefinitionsBuilder(
-      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
-       G_FMAXNUM_IEEE});
+      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
 
   if (ST.hasVOP3PInsts()) {
     MinNumMaxNum.customFor(FPTypesPK16)
@@ -2100,9 +2118,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
         .legalFor(FPTypesPK16)
         .clampMaxNumElements(0, S16, 2)
         .scalarize(0);
+  } else if (ST.hasVOP3PInsts()){
+    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+        .lowerFor({V2S16})
+        .clampMaxNumElementsStrict(0, S16, 2)
+        .scalarize(0)
+        .lower();
   } else {
-    // TODO: Implement
-    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
+    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+        .scalarize(0)
+        .clampScalar(0, S32, S64)
+        .lower();
   }
 
   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
@@ -2159,8 +2185,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(
   case TargetOpcode::G_FMAXNUM:
   case TargetOpcode::G_FMINIMUMNUM:
   case TargetOpcode::G_FMAXIMUMNUM:
-  case TargetOpcode::G_FMINNUM_IEEE:
-  case TargetOpcode::G_FMAXNUM_IEEE:
     return legalizeMinNumMaxNum(Helper, MI);
   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
     return legalizeExtractVectorElt(MI, MRI, B);
@@ -2734,23 +2758,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
   MachineFunction &MF = Helper.MIRBuilder.getMF();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
-  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
-                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
-
-  // With ieee_mode disabled, the instructions have the correct behavior
-  // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
-  //
-  // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
-  // enabled.
-  if (!MFI->getMode().IEEE) {
-    if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
-        MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
-      return true;
-
-    return !IsIEEEOp;
-  }
-
-  if (IsIEEEOp)
+  // With ieee_mode disabled, the instructions have the correct behavior.
+  if (!MFI->getMode().IEEE)
     return true;
 
   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir
new file mode 100644
index 0000000000000..4b214e66ea994
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaximum.mir
@@ -0,0 +1,275 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9  %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX12  %s
+
+---
+name: test_fmaximum_f16
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX9-LABEL: name: test_fmaximum_f16
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[TRUNC]], [[TRUNC1]]
+    ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC1]]
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+    ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[SELECT]](s16)
+    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY2]](s16)
+    ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+    ;
+    ; GFX12-LABEL: name: test_fmaximum_f16
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s16) = G_FMAXIMUM [[TRUNC]], [[TRUNC1]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMAXIMUM]](s16)
+    ; GFX12-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s16) = G_TRUNC %0(s32)
+    %2:_(s32) = COPY $vgpr1
+    %3:_(s16) = G_TRUNC %2(s32)
+    %4:_(s16) = G_FMAXIMUM %1, %3
+    %5:_(s32) = G_ANYEXT %4(s16)
+    $vgpr0 = COPY %5(s32)
+    SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_f32
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX9-LABEL: name: test_fmaximum_f32
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+    ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+    ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+    ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+    ;
+    ; GFX12-LABEL: name: test_fmaximum_f32
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[COPY1]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+    ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = G_FMAXIMUM %0, %1
+    $vgpr0 = COPY %2(s32)
+    SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_f64
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GFX9-LABEL: name: test_fmaximum_f64
+    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+    ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s64) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+    ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s64), [[COPY1]]
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x7FF8000000000000
+    ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SELECT]](s64)
+    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64)
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+    ;
+    ; GFX12-LABEL: name: test_fmaximum_f64
+    ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+    ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s64) = G_FMAXIMUM [[COPY]], [[COPY1]]
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FMAXIMUM]](s64)
+    ; GFX12-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(s64) = G_FMAXIMUM %0, %1
+    $vgpr0_vgpr1 = COPY %2(s64)
+    SI_RETURN implicit $vgpr0_vgpr1
+...
+---
+name: test_fmaximum_v2f16
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX9-LABEL: name: test_fmaximum_v2f16
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC]](s16), [[TRUNC2]]
+    ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[TRUNC1]](s16), [[TRUNC3]]
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH7E00
+    ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM_IEEE]](<2 x s16>)
+    ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+    ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+    ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[TRUNC4]], [[C1]]
+    ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[TRUNC5]], [[C1]]
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT]](s16), [[SELECT1]](s16)
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](<2 x s16>)
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+    ;
+    ; GFX12-LABEL: name: test_fmaximum_v2f16
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+    ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<2 x s16>) = G_FMAXIMUM [[COPY]], [[COPY1]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](<2 x s16>)
+    ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+    %0:_(<2 x s16>) = COPY $vgpr0
+    %1:_(<2 x s16>) = COPY $vgpr1
+    %2:_(<2 x s16>) = G_FMAXIMUM %0, %1
+    $vgpr0 = COPY %2(<2 x s16>)
+    SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_v2f32
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+    ; GFX9-LABEL: name: test_fmaximum_v2f32
+    ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+    ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY2]]
+    ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY2]]
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+    ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+    ; GFX9-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY1]], [[COPY3]]
+    ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY1]](s32), [[COPY3]]
+    ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[FMAXNUM_IEEE1]], [[C]]
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32)
+    ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]](s32)
+    ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]](s32)
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+    ;
+    ; GFX12-LABEL: name: test_fmaximum_v2f32
+    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+    ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[COPY2]]
+    ; GFX12-NEXT: [[FMAXIMUM1:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY1]], [[COPY3]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+    ; GFX12-NEXT: $vgpr1 = COPY [[FMAXIMUM1]](s32)
+    ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(<2 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $vgpr3
+    %5:_(<2 x s32>) = G_BUILD_VECTOR %3(s32), %4(s32)
+    %6:_(<2 x s32>) = G_FMAXIMUM %2, %5
+    %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(<2 x s32>)
+    $vgpr0 = COPY %7(s32)
+    $vgpr1 = COPY %8(s32)
+    SI_RETURN implicit $vgpr0, implicit $vgpr1
+...
+---
+name: test_fmaximum_nsz_f32
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX9-LABEL: name: test_fmaximum_nsz_f32
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+    ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s32), [[COPY1]]
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x7FF8000000000000
+    ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[FMAXNUM_IEEE]], [[C]]
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SELECT]](s32)
+    ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+    ;
+    ; GFX12-LABEL: name: test_fmaximum_nsz_f32
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX12-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = nsz G_FMAXIMUM [[COPY]], [[COPY1]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[FMAXIMUM]](s32)
+    ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = nsz G_FMAXIMUM %0, %1
+    $vgpr0 = COPY %2(s32)
+    SI_RETURN implicit $vgpr0
+...
+---
+name: test_fmaximum_nnan_f32
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX9-LABEL: name: test_fmaximum_nnan_f32
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[FMAXNUM_IEEE]](s32)
+    ; GFX9-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+    ; GFX9-NEXT: SI_RETURN implicit $vgpr0
+    ;
+    ; GFX12-LABEL: name: test_fmaximum_nnan_f32
+    ; GFX12: liveins: $vgpr0, $vgpr1
+    ; GFX12-NEXT: {{  $}}
+    ; GFX1...
[truncated]

@mbrkusanin
Copy link
Collaborator Author

This also fixes compilation failures for VulkanCTS tests for gfx11 globalisel.

@github-actions
Copy link

github-actions bot commented Jul 29, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@mbrkusanin mbrkusanin force-pushed the legalize-fminimum-fmaximum branch from 6b98128 to 2d11a0f Compare July 29, 2025 11:08
Copy link
Contributor

@arsenm arsenm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM, but can you fix flag propagation in a follow up

KnownFPClass Src0Info = VT.computeKnownFPClass(Src0, fcZero);
KnownFPClass Src1Info = VT.computeKnownFPClass(Src1, fcZero);

if (!Src0Info.isKnownNeverZero() && !Src1Info.isKnownNeverZero()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there is a bug in this sequence (existing in the DAG one) but I don't remember what the status of that is

Add GlobalISel lowering of G_FMINIMUM and G_FMAXIMUM following the same
logic as in SDag's expandFMINIMUM_FMAXIMUM.
Update AMDGPU legalization rules: Pre GFX12 now uses new lowering method
and make G_FMINNUM_IEEE and G_FMAXNUM_IEEE legal to match SDag.
@mbrkusanin mbrkusanin force-pushed the legalize-fminimum-fmaximum branch from 2d11a0f to 796c135 Compare October 24, 2025 11:21
@mbrkusanin mbrkusanin merged commit fe5f499 into llvm:main Oct 24, 2025
10 checks passed
@mbrkusanin mbrkusanin deleted the legalize-fminimum-fmaximum branch October 24, 2025 12:48
@llvm-ci
Copy link
Collaborator

llvm-ci commented Oct 24, 2025

LLVM Buildbot has detected a new failure on builder clang-armv8-quick running on linaro-clang-armv8-quick while building llvm at step 5 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/154/builds/23141

Here is the relevant piece of the build log for the reference
Step 5 (ninja check 1) failure: stage 1 checked (failure)
******************** TEST 'Clangd Unit Tests :: ./ClangdTests/244/335' FAILED ********************
Script(shard):
--
GTEST_OUTPUT=json:/home/tcwg-buildbot/worker/clang-armv8-quick/stage1/tools/clang/tools/extra/clangd/unittests/./ClangdTests-Clangd Unit Tests-887105-244-335.json GTEST_SHUFFLE=0 GTEST_TOTAL_SHARDS=335 GTEST_SHARD_INDEX=244 /home/tcwg-buildbot/worker/clang-armv8-quick/stage1/tools/clang/tools/extra/clangd/unittests/./ClangdTests
--

Note: This is test shard 245 of 335.
[==========] Running 4 tests from 4 test suites.
[----------] Global test environment set-up.
[----------] 1 test from CompletionStringTest
[ RUN      ] CompletionStringTest.Documentation
[       OK ] CompletionStringTest.Documentation (0 ms)
[----------] 1 test from CompletionStringTest (0 ms total)

[----------] 1 test from FuzzyMatch
[ RUN      ] FuzzyMatch.Matches
[       OK ] FuzzyMatch.Matches (47 ms)
[----------] 1 test from FuzzyMatch (47 ms total)

[----------] 1 test from CrossFileRenameTests
[ RUN      ] CrossFileRenameTests.WithUpToDateIndex
ASTWorker building file /clangd-test/foo.h version null with command 
[/clangd-test]
clang -xobjective-c++ /clangd-test/foo.h
Driver produced command: cc1 -cc1 -triple armv8a-unknown-linux-gnueabihf -fsyntax-only -disable-free -clear-ast-before-backend -main-file-name foo.h -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -target-cpu generic -target-feature +read-tp-tpidruro -target-feature +vfp2 -target-feature +vfp2sp -target-feature +vfp3 -target-feature +vfp3d16 -target-feature +vfp3d16sp -target-feature +vfp3sp -target-feature +fp16 -target-feature +vfp4 -target-feature +vfp4d16 -target-feature +vfp4d16sp -target-feature +vfp4sp -target-feature +fp-armv8 -target-feature +fp-armv8d16 -target-feature +fp-armv8d16sp -target-feature +fp-armv8sp -target-feature -fullfp16 -target-feature +fp64 -target-feature +d32 -target-feature +sha2 -target-feature +aes -target-feature -fp16fml -target-feature +neon -target-abi aapcs-linux -mfloat-abi hard -debugger-tuning=gdb -fdebug-compilation-dir=/clangd-test -fcoverage-compilation-dir=/clangd-test -resource-dir lib/clang/22 -internal-isystem lib/clang/22/include -internal-isystem /usr/local/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -fdeprecated-macro -ferror-limit 19 -fno-signed-char -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -fobjc-runtime=gcc -fobjc-encode-cxx-class-template-spec -fobjc-exceptions -fcxx-exceptions -fexceptions -no-round-trip-args -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -x objective-c++ /clangd-test/foo.h
Building first preamble for /clangd-test/foo.h version null
Built preamble of size 421104 for file /clangd-test/foo.h version null in 0.15 seconds
indexed preamble AST for /clangd-test/foo.h version null:
  symbol slab: 0 symbols, 68 bytes
  ref slab: 0 symbols, 0 refs, 72 bytes
  relations slab: 0 relations, 12 bytes
indexed file AST for /clangd-test/foo.h version null:
  symbol slab: 3 symbols, 4584 bytes
  ref slab: 3 symbols, 5 refs, 4232 bytes
  relations slab: 0 relations, 12 bytes
Build dynamic index for main-file symbols with estimated memory usage of 11148 bytes
ASTWorker building file /clangd-test/foo.cc version null with command 
[/clangd-test]
clang -xobjective-c++ /clangd-test/foo.cc
Driver produced command: cc1 -cc1 -triple armv8a-unknown-linux-gnueabihf -fsyntax-only -disable-free -clear-ast-before-backend -main-file-name foo.cc -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -target-cpu generic -target-feature +read-tp-tpidruro -target-feature +vfp2 -target-feature +vfp2sp -target-feature +vfp3 -target-feature +vfp3d16 -target-feature +vfp3d16sp -target-feature +vfp3sp -target-feature +fp16 -target-feature +vfp4 -target-feature +vfp4d16 -target-feature +vfp4d16sp -target-feature +vfp4sp -target-feature +fp-armv8 -target-feature +fp-armv8d16 -target-feature +fp-armv8d16sp -target-feature +fp-armv8sp -target-feature -fullfp16 -target-feature +fp64 -target-feature +d32 -target-feature +sha2 -target-feature +aes -target-feature -fp16fml -target-feature +neon -target-abi aapcs-linux -mfloat-abi hard -debugger-tuning=gdb -fdebug-compilation-dir=/clangd-test -fcoverage-compilation-dir=/clangd-test -resource-dir lib/clang/22 -internal-isystem lib/clang/22/include -internal-isystem /usr/local/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -fdeprecated-macro -ferror-limit 19 -fno-signed-char -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -fobjc-runtime=gcc -fobjc-encode-cxx-class-template-spec -fobjc-exceptions -fcxx-exceptions -fexceptions -no-round-trip-args -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -x objective-c++ /clangd-test/foo.cc
Building first preamble for /clangd-test/foo.cc version null
Built preamble of size 422052 for file /clangd-test/foo.cc version null in 0.42 seconds
indexed preamble AST for /clangd-test/foo.cc version null:
  symbol slab: 3 symbols, 4584 bytes
  ref slab: 0 symbols, 0 refs, 72 bytes
  relations slab: 0 relations, 12 bytes
Build dynamic index for header symbols with estimated memory usage of 6408 bytes
indexed file AST for /clangd-test/foo.cc version null:
  symbol slab: 3 symbols, 4584 bytes
  ref slab: 4 symbols, 9 refs, 4232 bytes
...

dvbuka pushed a commit to dvbuka/llvm-project that referenced this pull request Oct 27, 2025
Add GlobalISel lowering of G_FMINIMUM and G_FMAXIMUM following the same
logic as in SDag's expandFMINIMUM_FMAXIMUM.
Update AMDGPU legalization rules: Pre GFX12 now uses new lowering method
and make G_FMINNUM_IEEE and G_FMAXNUM_IEEE legal to match SDag.
Lukacma pushed a commit to Lukacma/llvm-project that referenced this pull request Oct 29, 2025
Add GlobalISel lowering of G_FMINIMUM and G_FMAXIMUM following the same
logic as in SDag's expandFMINIMUM_FMAXIMUM.
Update AMDGPU legalization rules: Pre GFX12 now uses new lowering method
and make G_FMINNUM_IEEE and G_FMAXNUM_IEEE legal to match SDag.
aokblast pushed a commit to aokblast/llvm-project that referenced this pull request Oct 30, 2025
Add GlobalISel lowering of G_FMINIMUM and G_FMAXIMUM following the same
logic as in SDag's expandFMINIMUM_FMAXIMUM.
Update AMDGPU legalization rules: Pre GFX12 now uses new lowering method
and make G_FMINNUM_IEEE and G_FMAXNUM_IEEE legal to match SDag.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants