diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index eaa1870f4be28..5a08e7d6db347 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1238,6 +1238,15 @@ def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst",
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
 
+// Opt-out for the aggressive FMA combines: when set,
+// SITargetLowering::enableAggressiveFMAFusion returns false.
+def FeatureDisableAggressiveFMAFusion : SubtargetFeature<
+  "disable-aggressive-fma-fusion",
+  "DisableAggressiveFMAFusion",
+  "true",
+  "Disable aggressive fusion of fmul and fadd/fsub into fma."
+>;
+
 // Ugly hack to accomodate assembling modules with mixed
 // wavesizes. Ideally we would have a mapping symbol in assembly which
 // would keep track of which sections of code should be treated as
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index ed03ef21b6dda..0c380a7e4dc84 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -76,6 +76,7 @@ class AMDGPUSubtarget {
   bool EnablePromoteAlloca = false;
   bool HasTrigReducedRange = false;
   bool FastFMAF32 = false;
+  bool DisableAggressiveFMAFusion = false;
   unsigned EUsPerCU = 4;
   unsigned MaxWavesPerEU = 10;
   unsigned LocalMemorySize = 0;
@@ -303,6 +304,12 @@ class AMDGPUSubtarget {
     return FastFMAF32;
   }
 
+  /// \returns true if the "disable-aggressive-fma-fusion" subtarget feature
+  /// is set, i.e. aggressive FMA fusion was explicitly turned off.
+  bool hasDisableAggressiveFMAFusion() const {
+    return DisableAggressiveFMAFusion;
+  }
+
   bool isPromoteAllocaEnabled() const {
     return EnablePromoteAlloca;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..554549063dbcc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -288,6 +288,7 @@ const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
     AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
     AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
     AMDGPU::FeatureUnalignedAccessMode,
+    AMDGPU::FeatureDisableAggressiveFMAFusion,
 
     AMDGPU::FeatureAutoWaitcntBeforeBarrier,
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 16530087444d2..f0ac08c3b69f9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6502,10 +6502,14 @@ bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
   // When fma is quarter rate, for f64 where add / sub are at best half rate,
   // most of these combines appear to be cycle neutral but save on instruction
   // count / code size.
-  return true;
+  // Still on by default; the disable-aggressive-fma-fusion feature opts out.
+  return !Subtarget->hasDisableAggressiveFMAFusion();
 }
 
-bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
+bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const {
+  // Same policy as the EVT overload above.
+  return !Subtarget->hasDisableAggressiveFMAFusion();
+}
 
 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                          EVT VT) const {