@@ -17,6 +17,7 @@ SPDX-License-Identifier: MIT
1717#include " llvm/IR/InstIterator.h"
1818#include " llvm/Support/MemoryBuffer.h"
1919#include " llvm/Support/GenericDomTree.h"
20+ #include " llvm/Transforms/Utils/Cloning.h"
2021#include " llvm/Bitcode/BitcodeReader.h"
2122#include " llvm/Bitcode/BitcodeWriter.h"
2223#include " llvm/Linker/Linker.h"
@@ -632,11 +633,16 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
632633 m_changed = false ;
633634
634635 // When we test it, we need to set emuKind
635- if (IGC_IS_FLAG_ENABLED (TestIGCPreCompiledFunctions))
636+ if (IGC_GET_FLAG_VALUE (TestIGCPreCompiledFunctions) == 1 )
636637 {
637638 m_emuKind = EmuKind::EMU_DP;
638639 checkAndSetEnableSubroutine ();
639640 }
641+ else if (IGC_GET_FLAG_VALUE (TestIGCPreCompiledFunctions) == 2 )
642+ {
643+ m_emuKind = EmuKind::EMU_DP_DIV_SQRT;
644+ checkAndSetEnableSubroutine ();
645+ }
640646 // sanity check
641647 if (m_emuKind == 0 ) {
642648 // Nothing to emulate
@@ -826,12 +832,11 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
826832 }
827833 }
828834
829- unsigned totalNumberOfInlinedInst = 0 ;
835+ llvm::SmallVector<ImportedFunction, 32 > importedFunctions;
836+ unsigned totalNumberOfInlinedInst = 0 , totalNumberOfPotentiallyInlinedInst = 0 ;
830837 int emuFC = (int )IGC_GET_FLAG_VALUE (EmulationFunctionControl);
831838
832- // Post processing, set those imported functions as internal linkage
833- // and alwaysinline. Also count how many instructions would be added
834- // to the shader if inlining occurred.
839+ // Post processing, set those imported functions as internal linkage.
835840 for (auto II = M.begin (), IE = M.end (); II != IE; )
836841 {
837842 Function* Func = &(*II);
@@ -853,92 +858,101 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
853858 continue ;
854859 }
855860
856- // Remove noinline/AlwaysInline attr if present.
857- Func->removeFnAttr (llvm::Attribute::NoInline);
861+ if (std::find (importedFunctions.begin (), importedFunctions.end (), Func) == importedFunctions.end ())
862+ importedFunctions.push_back (Func);
863+ }
864+ else
865+ {
866+ // Make sure original func isn't inlined accidentally.
858867 Func->removeFnAttr (llvm::Attribute::AlwaysInline);
868+ }
869+ }
859870
860- if (m_enableCallForEmulation &&
861- emuFC != FLAG_FCALL_DEFAULT &&
862- emuFC != FLAG_FCALL_FORCE_INLINE)
863- {
864- // Disable inlining completely.
865- continue ;
866- }
867-
868- if (Func->hasOneUse () || emuFC == FLAG_FCALL_FORCE_INLINE)
869- {
870- Func->addFnAttr (llvm::Attribute::AlwaysInline);
871- continue ;
872- }
871+ // Sort imported functions in preferred inlining order.
872+ std::sort (importedFunctions.begin (), importedFunctions.end (), ImportedFunction::compare);
873873
874- // Count number of instructions in the function
875- unsigned NumInst = 0 ;
876- for (BasicBlock& BB : Func->getBasicBlockList ()) {
877- NumInst += BB.getInstList ().size ();
878- }
874+ // Post processing, set those imported functions as alwaysinline.
875+ // Also count how many instructions would be added to the shader
876+ // if inlining occurred.
877+ for (auto II = importedFunctions.begin (), IE = importedFunctions.end (); II != IE; ++II)
878+ {
879+ Function* Func = II->F ;
879880
880- // Don't want to subroutine small functions
881- if (NumInst <= 5 )
882- {
883- // Add AlwaysInline attribute to force inlining all calls.
884- Func->addFnAttr (llvm::Attribute::AlwaysInline);
881+ // Remove noinline/AlwaysInline attr if present.
882+ Func->removeFnAttr (llvm::Attribute::NoInline);
883+ Func->removeFnAttr (llvm::Attribute::AlwaysInline);
885884
886- continue ;
887- }
885+ if (m_enableCallForEmulation &&
886+ emuFC != FLAG_FCALL_DEFAULT &&
887+ emuFC != FLAG_FCALL_FORCE_INLINE)
888+ {
889+ // Disable inlining completely.
890+ continue ;
891+ }
888892
889- totalNumberOfInlinedInst += NumInst * Func->getNumUses ();
893+ if (Func->hasOneUse () || emuFC == FLAG_FCALL_FORCE_INLINE)
894+ {
895+ Func->addFnAttr (llvm::Attribute::AlwaysInline);
896+ continue ;
890897 }
891- else
898+
899+ // Don't want to subroutine small functions
900+ if (II->funcInstructions <= 5 )
892901 {
893- // Make sure original func isn't inlined accidentally.
894- Func->removeFnAttr (llvm::Attribute::AlwaysInline);
902+ // Add AlwaysInline attribute to force inlining all calls.
903+ Func->addFnAttr (llvm::Attribute::AlwaysInline);
904+
905+ continue ;
895906 }
896- }
897907
898- // If true, it is a slow version of DP emu functions. Those functions
899- // are the original ones for just passing conformance, not for perf.
900- auto isSlowDPEmuFunc = [](Function* F) {
901- StringRef FN = F->getName ();
902- if (FN.equals (" __igcbuiltin_dp_add" ) ||
903- FN.equals (" __igcbuiltin_dp_sub" ) ||
904- FN.equals (" __igcbuiltin_dp_fma" ) ||
905- FN.equals (" __igcbuiltin_dp_mul" ) ||
906- FN.equals (" __igcbuiltin_dp_div" ) ||
907- FN.equals (" __igcbuiltin_dp_cmp" ) ||
908- FN.equals (" __igcbuiltin_dp_to_int32" ) ||
909- FN.equals (" __igcbuiltin_dp_to_uint32" ) ||
910- FN.equals (" __igcbuiltin_int32_to_dp" ) ||
911- FN.equals (" __igcbuiltin_uint32_to_dp" ) ||
912- FN.equals (" __igcbuiltin_dp_to_sp" ) ||
913- FN.equals (" __igcbuiltin_sp_to_dp" ) ||
914- FN.equals (" __igcbuiltin_dp_sqrt" )) {
915- return true ;
908+ totalNumberOfPotentiallyInlinedInst += II->totalInstructions ;
909+
910+ // If function fits in threshold, always inline.
911+ if (totalNumberOfInlinedInst + II->totalInstructions <= (unsigned )IGC_GET_FLAG_VALUE (InlinedEmulationThreshold))
912+ {
913+ totalNumberOfInlinedInst += II->totalInstructions ;
914+ Func->addFnAttr (llvm::Attribute::AlwaysInline);
916915 }
917- return false ;
918- };
916+ }
919917
920- for (auto II = M.begin (), IE = M.end (); II != IE; )
918+ // Check if more functions can fit in threshold if they would be split into inline/noinline copies.
919+ if (m_enableCallForEmulation && emuFC == FLAG_FCALL_DEFAULT && totalNumberOfInlinedInst < (unsigned )IGC_GET_FLAG_VALUE (InlinedEmulationThreshold))
921920 {
922- Function* Func = &(*II);
923- ++II;
924- if (!Func || Func->isDeclaration ())
921+ for (auto II = importedFunctions.begin (); II != importedFunctions.end (); ++II)
925922 {
926- continue ;
923+ Function* Func = II->F ;
924+
925+ if (Func->hasFnAttribute (llvm::Attribute::AlwaysInline))
926+ continue ;
927+
928+ unsigned calls = ((unsigned )IGC_GET_FLAG_VALUE (InlinedEmulationThreshold) - totalNumberOfInlinedInst) / II->funcInstructions ;
929+ if (calls > 0 )
930+ {
931+ // Split function into inline/no-inline copies.
932+ ImportedFunction copy = createInlinedCopy (*II, calls);
933+ importedFunctions.push_back (copy);
934+ totalNumberOfInlinedInst += copy.totalInstructions ;
935+ }
927936 }
937+ }
928938
929- if (!origFunctions.count (Func) && !Func->hasFnAttribute (llvm::Attribute::AlwaysInline))
939+ for (auto II = importedFunctions.begin (), IE = importedFunctions.end (); II != IE; ++II)
940+ {
941+ Function* Func = II->F ;
942+
943+ if (!Func->hasFnAttribute (llvm::Attribute::AlwaysInline))
930944 {
931945 // Special handling of DP functions: any one that has not been marked as inline
932946 // at this point, it will be either subroutine or stackcall.
933- const bool isDPCallFunc = (isDPEmu () && isSlowDPEmuFunc (Func ));
947+ const bool isDPCallFunc = (isDPEmu () && II-> isSlowDPEmuFunc ());
934948
935949 // Use subroutine/stackcall for some DP emulation functions if
936950 // EmulationFunctionControl is set so, or
937951 // use subroutines if total number of instructions added when
938952 // all emulated functions are inlined exceed InlinedEmulationThreshold.
939953 // If Func is a slow version of DP emu func, perf isn't important.
940954 if (m_enableCallForEmulation &&
941- (totalNumberOfInlinedInst > (unsigned )IGC_GET_FLAG_VALUE (InlinedEmulationThreshold) ||
955+ (totalNumberOfPotentiallyInlinedInst > (unsigned )IGC_GET_FLAG_VALUE (InlinedEmulationThreshold) ||
942956 isDPCallFunc))
943957 {
944958 Func->addFnAttr (llvm::Attribute::NoInline);
@@ -1003,6 +1017,128 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
10031017 return m_changed;
10041018}
10051019
1020+ PreCompiledFuncImport::ImportedFunction::ImportedFunction (Function* F)
1021+ : F(F), type(EmuType::OTHER), funcInstructions(0 ), totalInstructions(0 )
1022+ {
1023+ // Count number of new instructions added by inlining.
1024+ for (BasicBlock& BB : F->getBasicBlockList ())
1025+ funcInstructions += BB.getInstList ().size ();
1026+
1027+ updateUses ();
1028+
1029+ // Get type of imported function.
1030+ StringRef name = F->getName ();
1031+
1032+ if (name.equals (" __igcbuiltin_dp_div_nomadm_ieee" ) ||
1033+ name.equals (" __igcbuiltin_dp_div_nomadm_fast" ) ||
1034+ name.equals (" __igcbuiltin_dp_sqrt_nomadm_ieee" ) ||
1035+ name.equals (" __igcbuiltin_dp_sqrt_nomadm_fast" ))
1036+ {
1037+ type = EmuType::FASTDP;
1038+ }
1039+ else if (name.equals (" __igcbuiltin_dp_add" ) ||
1040+ name.equals (" __igcbuiltin_dp_sub" ) ||
1041+ name.equals (" __igcbuiltin_dp_fma" ) ||
1042+ name.equals (" __igcbuiltin_dp_mul" ) ||
1043+ name.equals (" __igcbuiltin_dp_div" ) ||
1044+ name.equals (" __igcbuiltin_dp_cmp" ) ||
1045+ name.equals (" __igcbuiltin_dp_to_int32" ) ||
1046+ name.equals (" __igcbuiltin_dp_to_uint32" ) ||
1047+ name.equals (" __igcbuiltin_int32_to_dp" ) ||
1048+ name.equals (" __igcbuiltin_uint32_to_dp" ) ||
1049+ name.equals (" __igcbuiltin_dp_to_sp" ) ||
1050+ name.equals (" __igcbuiltin_sp_to_dp" ) ||
1051+ name.equals (" __igcbuiltin_dp_sqrt" ))
1052+ {
1053+ // If true, it is a slow version of DP emu functions. Those functions
1054+ // are the original ones for just passing conformance, not for perf.
1055+ type = EmuType::SLOWDP;
1056+ }
1057+ else
1058+ {
1059+ for (int i = 0 ; i < NUM_FUNCTIONS && type == EmuType::OTHER; ++i)
1060+ {
1061+ for (int j = 0 ; j < NUM_TYPES && type == EmuType::OTHER; ++j)
1062+ {
1063+ if (name.equals (m_Int64SpDivRemFunctionNames[i][j]) ||
1064+ name.equals (m_Int64DpDivRemFunctionNames[i][j]))
1065+ {
1066+ type = EmuType::INT64;
1067+ }
1068+ }
1069+ }
1070+ }
1071+ }
1072+
1073+ void PreCompiledFuncImport::ImportedFunction::updateUses ()
1074+ {
1075+ totalInstructions = funcInstructions * F->getNumUses ();
1076+ }
1077+
1078+ PreCompiledFuncImport::ImportedFunction PreCompiledFuncImport::ImportedFunction::copy (ImportedFunction& other)
1079+ {
1080+ ValueToValueMapTy VM;
1081+ Function* copy = CloneFunction (other.F , VM);
1082+ return PreCompiledFuncImport::ImportedFunction (copy, other.type , other.funcInstructions , 0 );
1083+ }
1084+
1085+ // Compare two imported functions in order preferred for inlining.
1086+ bool PreCompiledFuncImport::ImportedFunction::compare (ImportedFunction& L, ImportedFunction& R)
1087+ {
1088+ // First sort by preferred type of emulation.
1089+ if (L.type != R.type )
1090+ return L.type < R.type ;
1091+
1092+ // Then sort by number of inlined instructions.
1093+ return L.totalInstructions < R.totalInstructions ;
1094+ };
1095+
1096+ PreCompiledFuncImport::ImportedFunction PreCompiledFuncImport::createInlinedCopy (ImportedFunction& IF, unsigned n)
1097+ {
1098+ std::vector<CallInst*> toDelete;
1099+
1100+ // Make copy that is always inlined.
1101+ ImportedFunction copy = ImportedFunction::copy (IF);
1102+ copy.F ->setName (IF.F ->getName () + " _always_inline" );
1103+ copy.F ->addFnAttr (llvm::Attribute::AlwaysInline);
1104+
1105+ // Collect first n calls to replace with copy.
1106+ llvm::SmallVector<CallInst*, 8 > calls;
1107+ auto it = IF.F ->user_begin ();
1108+ for (unsigned i = 0 ; i < n; ++i)
1109+ {
1110+ CallInst* oldCall = dyn_cast<CallInst>(*(it++));
1111+ IGC_ASSERT (oldCall);
1112+ calls.push_back (oldCall);
1113+ }
1114+
1115+ // Replace with always inlined copy.
1116+ for (CallInst* oldCall : calls)
1117+ {
1118+ std::vector<Value*> args;
1119+ for (unsigned arg = 0 ; arg < IGCLLVM::getNumArgOperands (oldCall); ++arg)
1120+ args.push_back (oldCall->getArgOperand (arg));
1121+
1122+ // Create new call and insert it before old one
1123+ CallInst* newCall = CallInst::Create (copy.F , args, " " , oldCall);
1124+
1125+ newCall->setCallingConv (copy.F ->getCallingConv ());
1126+ newCall->setAttributes (oldCall->getAttributes ());
1127+ newCall->setDebugLoc (oldCall->getDebugLoc ());
1128+
1129+ oldCall->replaceAllUsesWith (newCall);
1130+ toDelete.push_back (oldCall);
1131+ }
1132+
1133+ for (auto C : toDelete)
1134+ C->eraseFromParent ();
1135+
1136+ copy.updateUses ();
1137+ IF.updateUses ();
1138+
1139+ return copy;
1140+ }
1141+
10061142void PreCompiledFuncImport::visitBinaryOperator (BinaryOperator& I)
10071143{
10081144 if (I.getOperand (0 )->getType ()->isIntOrIntVectorTy ())
@@ -2547,6 +2683,7 @@ void PreCompiledFuncImport::checkAndSetEnableSubroutine()
25472683 bool SPDiv = isSPDiv ();
25482684 bool DPEmu = isDPEmu ();
25492685 bool DPDivSqrtEmu = isDPDivSqrtEmu ();
2686+ bool I64DivRem = isI64DivRem ();
25502687
25512688 Module* M = m_pCtx->getModule ();
25522689 for (auto FI = M->begin (), FE = M->end (); FI != FE; ++FI)
@@ -2589,6 +2726,15 @@ void PreCompiledFuncImport::checkAndSetEnableSubroutine()
25892726 m_enableCallForEmulation = true ;
25902727 }
25912728 break ;
2729+ case Instruction::UDiv:
2730+ case Instruction::URem:
2731+ case Instruction::SDiv:
2732+ case Instruction::SRem:
2733+ if (I64DivRem && I->getOperand (0 )->getType ()->isIntegerTy (64 ))
2734+ {
2735+ m_enableCallForEmulation = true ;
2736+ }
2737+ break ;
25922738 }
25932739
25942740 GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(I);
0 commit comments