1717#include < list>
1818#include < map>
1919#include < shared_mutex>
20+ #include < unordered_map>
21+ #include < unordered_set>
2022#include < vector>
2123
2224#include " ExclusiveAccess.h"
@@ -58,6 +60,7 @@ struct GenericPluginTy;
5860struct GenericKernelTy ;
5961struct GenericDeviceTy ;
6062struct RecordReplayTy ;
63+ struct KernelRunRecord ;
6164
6265// / Class that wraps the __tgt_async_info to simply its usage. In case the
6366// / object is constructed without a valid __tgt_async_info, the object will use
@@ -1105,6 +1108,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
11051108
11061109 bool getMultiDeviceKernelValue (void *EntryPtr);
11071110
  /// Return the kernel autotuning record table for this device.
  /// NOTE(review): may be null before initialization — ownership and setup are
  /// not visible in this chunk; confirm against the device init path.
  KernelRunRecord *getKernelRunRecords() const { return KernelRunRecords; }
1112+
11081113 // / Return true if a descriptor of size 'Size' should be allocated using
11091114 // / shared memory. Default implementation returns 'false',
11101115 virtual bool useSharedMemForDescriptor (int64_t Size);
@@ -1256,6 +1261,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
12561261 // / This is used to run the RPC server during task synchronization.
12571262 RPCServerTy *RPCServer;
12581263
  /// Per-kernel run records and tuning state used in runtime autotuning.
1265+ KernelRunRecord *KernelRunRecords;
1266+
12591267private:
12601268#ifdef OMPT_SUPPORT
12611269 // / OMPT callback functions
@@ -1282,6 +1290,99 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
12821290 bool IsFastReductionEnabled = false ;
12831291};
12841292
1293+ // / Struct represents the metadata for each kernel run on the device.
1294+ struct KernelRunRecord {
1295+
1296+ struct KernelRunEntry {
1297+ std::string KernelName;
1298+ uint32_t NumTeams;
1299+ uint32_t NumThreads;
1300+ uint64_t RunDuration;
1301+ };
1302+
1303+ // Metadata used in tuning process.
1304+ struct TuningMetadata {
1305+ uint32_t IdxThread = 0 ;
1306+ uint32_t IdxCUMultiplier = 0 ;
1307+ // Tuning history.
1308+ std::vector<KernelRunEntry> RunEntries;
1309+ // Run counters.
1310+ uint32_t RunCounters;
1311+ // Entry with minimum running time.
1312+ KernelRunEntry MinEntries;
1313+ };
1314+
1315+ // Add a new entry
1316+ void addEntry (std::string KernelName, uint32_t NumTeams, uint32_t NumThreads,
1317+ uint64_t RunDuration) {
1318+ KernelRunEntry NewRunEnry = {KernelName, NumTeams, NumThreads, RunDuration};
1319+ TuningData[KernelName].RunEntries .push_back (NewRunEnry);
1320+ TuningData[KernelName].RunCounters ++;
1321+
1322+ // Update min entries.
1323+ auto MinDuration = TuningData[KernelName].MinEntries .RunDuration ;
1324+ if (MinDuration > RunDuration || MinDuration == 0 ) {
1325+ TuningData[KernelName].MinEntries = NewRunEnry;
1326+ }
1327+ }
1328+
1329+ // Get parameters for next kernel launch.
1330+ std::pair<uint32_t , uint32_t >
1331+ getLaunchParamsForKernel (std::string KernelName,
1332+ GenericDeviceTy &GenericDevice) {
1333+ // If the kernel reaches the run limit,
1334+ // return the current optimal launch parameters.
1335+ if (reachedRunLimitForKernel (KernelName)) {
1336+ auto MinEntry = TuningData[KernelName].MinEntries ;
1337+ return {MinEntry.NumTeams , MinEntry.NumThreads };
1338+ }
1339+
1340+ // Pick new launch parameters.
1341+ uint32_t IdxCUMulti = TuningData[KernelName].IdxCUMultiplier ;
1342+ uint32_t IdxThread = TuningData[KernelName].IdxThread ;
1343+
1344+ if (IdxCUMulti >= CUMultiplierCandidate.size ()) {
1345+ // No more element to search.
1346+ // Return current optimal launch parameters.
1347+ return {TuningData[KernelName].MinEntries .NumTeams ,
1348+ TuningData[KernelName].MinEntries .NumThreads };
1349+ }
1350+
1351+ // New team/thread pair for launch parameters.
1352+ uint32_t NumCU = GenericDevice.getNumComputeUnits ();
1353+ std::pair<uint32_t , uint32_t > NewLaunchParams = {
1354+ CUMultiplierCandidate[IdxCUMulti] * NumCU, ThreadCandidate[IdxThread]};
1355+
1356+ // Update indices.
1357+ IdxThread++;
1358+ TuningData[KernelName].IdxThread = IdxThread;
1359+
1360+ if (IdxThread >= ThreadCandidate.size ()) {
1361+ TuningData[KernelName].IdxThread = 0 ;
1362+ TuningData[KernelName].IdxCUMultiplier ++;
1363+ }
1364+
1365+ return NewLaunchParams;
1366+ }
1367+
1368+ bool reachedRunLimitForKernel (std::string KernelName) {
1369+ return TuningData[KernelName].RunCounters > RunLimiter;
1370+ }
1371+
1372+ uint32_t getRunCounterForKernel (std::string KernelName) {
1373+ return TuningData[KernelName].RunCounters ;
1374+ }
1375+
1376+ private:
1377+ // Candidates for thread and team.
1378+ std::vector<uint32_t > ThreadCandidate = {32 , 64 , 128 , 256 , 512 , 1024 };
1379+ std::vector<uint32_t > CUMultiplierCandidate = {4 , 8 , 16 , 32 , 64 , 128 };
1380+ // The max number of tuning runs for each kernel.
1381+ uint32_t RunLimiter = ThreadCandidate.size() * CUMultiplierCandidate.size();
1382+ // Used for keeping track of the metatdata used in tuning for each kernel.
1383+ std::unordered_map<std::string, TuningMetadata> TuningData;
1384+ };
1385+
12851386// / Class implementing common functionalities of offload plugins. Each plugin
12861387// / should define the specific plugin class, derive from this generic one, and
12871388// / implement the necessary virtual function members.
0 commit comments