Skip to content

Commit 82b3142

Browse files
authored
track tool overhead (#252)
* basic infrastructure for tracking tool overhead * add tool overhead tags for device timing and finish after enqueue * cache the process ID * switch to sprintf and write vs stream insertion operators * fix Linux build * add tool overhead tags for adding timing events * add a control for tool overhead tracking * remove a stray ifdef * add special cases to reduce string operations for chrome call logging * open files as binary instead of text * add comments, cosmetic clean up
1 parent 788c47c commit 82b3142

File tree

5 files changed

+200
-90
lines changed

5 files changed

+200
-90
lines changed

docs/controls.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,10 @@ If set to a nonzero value, the Intercept Layer for OpenCL Applications will writ
259259

260260
If set to a nonzero value, the Intercept Layer for OpenCL Applications will track the minimum, maximum, and average host CPU time for each OpenCL entry point. When the process exits, this information will be included in the file "clIntercept\_report.txt".
261261

262+
##### `ToolOverheadTiming` (bool)
263+
264+
If set to a nonzero value, the Intercept Layer for OpenCL Applications will include some types of tool overhead in timing reports and some types of logging.
265+
262266
##### `DevicePerformanceTiming` (bool)
263267

264268
If set to a nonzero value, the Intercept Layer for OpenCL Applications will add event profiling to track the minimum, maximum, and average device time for each OpenCL command. This operation may be fairly intrusive and may have side effects; in particular it forces all command queues to be created with PROFILING\_ENABLED and may increment the reference count for application events. When the process exits, this information will be included in the file "clIntercept\_report.txt".

intercept/mdapi/intercept_mdapi.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ void CLIntercept::initCustomPerfCounters()
201201

202202
OS().MakeDumpDirectories( fileName );
203203

204-
m_MetricDump.open( fileName.c_str(), std::ios::out );
204+
m_MetricDump.open( fileName.c_str(), std::ios::out | std::ios::binary );
205205

206206
m_pMDHelper->PrintMetricNames( m_MetricDump );
207207
m_pMDHelper->PrintMetricUnits( m_MetricDump );

intercept/src/controls.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ CLI_CONTROL( bool, ReportToFile, true, "If s
4949

5050
CLI_CONTROL_SEPARATOR( Performance Timing Controls: )
5151
CLI_CONTROL( bool, HostPerformanceTiming, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will track the minimum, maximum, and average host CPU time for each OpenCL entry point. When the process exits, this information will be included in the file \"clIntercept_report.txt\"." )
52+
CLI_CONTROL( bool, ToolOverheadTiming, true, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will include some types of tool overhead in timing reports and some types of logging." )
5253
CLI_CONTROL( bool, DevicePerformanceTiming, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will add event profiling to track the minimum, maximum, and average device time for each OpenCL command. This operation may be fairly intrusive and may have side effects; in particular it forces all command queues to be created with PROFILING_ENABLED and may increment the reference count for application events. When the process exits, this information will be included in the file \"clIntercept_report.txt\"." )
5354
CLI_CONTROL( bool, DevicePerformanceTimeKernelInfoTracking,false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will distinguish between OpenCL NDRange kernels using information such as the kernel's Preferred Work Group Size Multiple (AKA SIMD size)." )
5455
CLI_CONTROL( bool, DevicePerformanceTimeGWOTracking, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will distinguish between OpenCL NDRange kernels with different global work offsets for the purpose of device performance timing." )

intercept/src/intercept.cpp

Lines changed: 126 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ void CLIntercept::Delete( CLIntercept*& pIntercept )
104104
CLIntercept::CLIntercept( void* pGlobalData )
105105
: m_OS( pGlobalData )
106106
{
107+
m_ProcessId = m_OS.GetProcessID();
108+
107109
m_Dispatch = {0};
108110
m_DispatchX[NULL] = {0};
109111

@@ -371,11 +373,15 @@ bool CLIntercept::init()
371373

372374
if( m_Config.AppendFiles )
373375
{
374-
m_InterceptLog.open( fileName.c_str(), std::ios::out | std::ios::app );
376+
m_InterceptLog.open(
377+
fileName.c_str(),
378+
std::ios::out | std::ios::binary | std::ios::app );
375379
}
376380
else
377381
{
378-
m_InterceptLog.open( fileName.c_str(), std::ios::out );
382+
m_InterceptLog.open(
383+
fileName.c_str(),
384+
std::ios::out | std::ios::binary );
379385
}
380386
}
381387

@@ -389,14 +395,15 @@ bool CLIntercept::init()
389395
fileName += sc_TraceFileName;
390396

391397
OS().MakeDumpDirectories( fileName );
392-
m_InterceptTrace.open( fileName.c_str(), std::ios::out );
398+
m_InterceptTrace.open(
399+
fileName.c_str(),
400+
std::ios::out | std::ios::binary );
393401
m_InterceptTrace << "[\n";
394402

395-
uint64_t processId = OS().GetProcessID();
396403
uint64_t threadId = OS().GetThreadID();
397404
std::string processName = OS().GetProcessName();
398405
m_InterceptTrace
399-
<< "{\"ph\":\"M\", \"name\":\"process_name\", \"pid\":" << processId
406+
<< "{\"ph\":\"M\", \"name\":\"process_name\", \"pid\":" << m_ProcessId
400407
<< ", \"tid\":" << threadId
401408
<< ", \"args\":{\"name\":\"" << processName
402409
<< "\"}},\n";
@@ -581,14 +588,13 @@ bool CLIntercept::init()
581588
if( m_Config.ChromeCallLogging ||
582589
m_Config.ChromePerformanceTiming )
583590
{
584-
uint64_t processId = OS().GetProcessID();
585591
uint64_t threadId = OS().GetThreadID();
586592

587593
using us = std::chrono::microseconds;
588594
uint64_t usStartTime =
589595
std::chrono::duration_cast<us>(m_StartTime.time_since_epoch()).count();
590596
m_InterceptTrace
591-
<< "{\"ph\":\"M\", \"name\":\"clintercept_start_time\", \"pid\":" << processId
597+
<< "{\"ph\":\"M\", \"name\":\"clintercept_start_time\", \"pid\":" << m_ProcessId
592598
<< ", \"tid\":" << threadId
593599
<< ", \"args\":{\"start_time\":" << usStartTime
594600
<< "}},\n";
@@ -714,11 +720,15 @@ void CLIntercept::report()
714720
std::ofstream os;
715721
if( m_Config.AppendFiles )
716722
{
717-
os.open( filepath, std::ios::out | std::ios::binary | std::ios::app );
723+
os.open(
724+
filepath,
725+
std::ios::out | std::ios::binary | std::ios::app );
718726
}
719727
else
720728
{
721-
os.open( filepath, std::ios::out | std::ios::binary );
729+
os.open(
730+
filepath,
731+
std::ios::out | std::ios::binary );
722732
}
723733
if( os.good() )
724734
{
@@ -13061,26 +13071,7 @@ void CLIntercept::chromeCallLoggingExit(
1306113071
{
1306213072
std::lock_guard<std::mutex> lock(m_Mutex);
1306313073

13064-
std::string name;
13065-
name += functionName;
13066-
13067-
if( !tag.empty() )
13068-
{
13069-
name += "( ";
13070-
name += tag;
13071-
name += " )";
13072-
}
13073-
13074-
std::ostringstream args;
13075-
if( includeId )
13076-
{
13077-
args << ", \"args\":{\"id\":" << enqueueCounter << "}";
13078-
}
13079-
13080-
uint64_t processId =
13081-
OS().GetProcessID();
13082-
uint64_t threadId =
13083-
OS().GetThreadID();
13074+
uint64_t threadId = OS().GetThreadID();
1308413075

1308513076
// This will name the thread if it is not named already.
1308613077
getThreadNumber( threadId );
@@ -13091,14 +13082,65 @@ void CLIntercept::chromeCallLoggingExit(
1309113082
uint64_t usDelta =
1309213083
std::chrono::duration_cast<us>(tickEnd - tickStart).count();
1309313084

13094-
m_InterceptTrace
13095-
<< "{\"ph\":\"X\", \"pid\":" << processId
13096-
<< ", \"tid\":" << threadId
13097-
<< ", \"name\":\"" << name
13098-
<< "\", \"ts\":" << usStart
13099-
<< ", \"dur\":" << usDelta
13100-
<< args.str()
13101-
<< "},\n";
13085+
// Notes for the future:
13086+
// Printing into a pre-allocated string buffer and then writing is
13087+
// measured to be faster than calling fprintf or stream insertion
13088+
// operators.
13089+
// Handling each of these four cases separately eliminates the need
13090+
// to concatenate strings and reduces overhead.
13091+
13092+
if( !tag.empty() && includeId )
13093+
{
13094+
int size = CLI_SPRINTF(m_StringBuffer, CLI_STRING_BUFFER_SIZE,
13095+
"{\"ph\":\"X\",\"pid\":%" PRIu64 ",\"tid\":%" PRIu64 ",\"name\":\"%s( %s )\""
13096+
",\"ts\":%" PRIu64 ",\"dur\":%" PRIu64 ",\"args\":{\"id\":%" PRIu64 "}},\n",
13097+
m_ProcessId,
13098+
threadId,
13099+
functionName.c_str(),
13100+
tag.c_str(),
13101+
usStart,
13102+
usDelta,
13103+
enqueueCounter );
13104+
m_InterceptTrace.write(m_StringBuffer, size);
13105+
}
13106+
else if( !tag.empty() )
13107+
{
13108+
int size = CLI_SPRINTF(m_StringBuffer, CLI_STRING_BUFFER_SIZE,
13109+
"{\"ph\":\"X\",\"pid\":%" PRIu64 ",\"tid\":%" PRIu64 ",\"name\":\"%s( %s )\""
13110+
",\"ts\":%" PRIu64 ",\"dur\":%" PRIu64 "},\n",
13111+
m_ProcessId,
13112+
threadId,
13113+
functionName.c_str(),
13114+
tag.c_str(),
13115+
usStart,
13116+
usDelta );
13117+
m_InterceptTrace.write(m_StringBuffer, size);
13118+
}
13119+
else if( includeId )
13120+
{
13121+
int size = CLI_SPRINTF(m_StringBuffer, CLI_STRING_BUFFER_SIZE,
13122+
"{\"ph\":\"X\",\"pid\":%" PRIu64 ",\"tid\":%" PRIu64 ",\"name\":\"%s\""
13123+
",\"ts\":%" PRIu64 ",\"dur\":%" PRIu64 ",\"args\":{\"id\":%" PRIu64 "}},\n",
13124+
m_ProcessId,
13125+
threadId,
13126+
functionName.c_str(),
13127+
usStart,
13128+
usDelta,
13129+
enqueueCounter );
13130+
m_InterceptTrace.write(m_StringBuffer, size);
13131+
}
13132+
else
13133+
{
13134+
int size = CLI_SPRINTF(m_StringBuffer, CLI_STRING_BUFFER_SIZE,
13135+
"{\"ph\":\"X\",\"pid\":%" PRIu64 ",\"tid\":%" PRIu64 ",\"name\":\"%s\""
13136+
",\"ts\":%" PRIu64 ",\"dur\":%" PRIu64 "},\n",
13137+
m_ProcessId,
13138+
threadId,
13139+
functionName.c_str(),
13140+
usStart,
13141+
usDelta );
13142+
m_InterceptTrace.write(m_StringBuffer, size);
13143+
}
1310213144
}
1310313145

1310413146
///////////////////////////////////////////////////////////////////////////////
@@ -13192,14 +13234,13 @@ void CLIntercept::chromeRegisterCommandQueue(
1319213234
}
1319313235
}
1319413236

13195-
uint64_t processId = OS().GetProcessID();
1319613237
m_InterceptTrace
13197-
<< "{\"ph\":\"M\", \"name\":\"thread_name\", \"pid\":" << processId
13238+
<< "{\"ph\":\"M\", \"name\":\"thread_name\", \"pid\":" << m_ProcessId
1319813239
<< ", \"tid\":-" << queueNumber
1319913240
<< ", \"args\":{\"name\":\"" << trackName
1320013241
<< "\"}},\n";
1320113242
m_InterceptTrace
13202-
<< "{\"ph\":\"M\", \"name\":\"thread_sort_index\", \"pid\":" << processId
13243+
<< "{\"ph\":\"M\", \"name\":\"thread_sort_index\", \"pid\":" << m_ProcessId
1320313244
<< ", \"tid\":-" << queueNumber
1320413245
<< ", \"args\":{\"sort_index\":\"" << queueNumber
1320513246
<< "\"}},\n";
@@ -13288,8 +13329,6 @@ void CLIntercept::chromeTraceEvent(
1328813329
// deltaNS, deltaNS / 1000.0 );
1328913330
//}
1329013331

13291-
const uint64_t processId = OS().GetProcessID();
13292-
1329313332
if( m_Config.ChromePerformanceTimingInStages )
1329413333
{
1329513334
const size_t cNumStates = 3;
@@ -13318,27 +13357,34 @@ void CLIntercept::chromeTraceEvent(
1331813357
{
1331913358
if( m_Config.ChromePerformanceTimingPerKernel )
1332013359
{
13321-
m_InterceptTrace
13322-
<< "{\"name\":\"" << name << " " << suffixes[state]
13323-
<< "\", \"ph\":\"X\", \"pid\":" << processId
13324-
<< ", \"tid\":\"" << name
13325-
<< "\", \"ts\":" << usStarts[state]
13326-
<< ", \"dur\":" << usDeltas[state]
13327-
<< ", \"cname\":\"" << colours[state]
13328-
<< "\", \"args\":{\"id\":" << enqueueCounter
13329-
<< "}},\n";
13360+
int size = CLI_SPRINTF(m_StringBuffer, CLI_STRING_BUFFER_SIZE,
13361+
"{\"ph\":\"X\",\"pid\":%" PRIu64 ",\"tid\":\"%s\",\"name\":\"%s %s\""
13362+
",\"ts\":%" PRIu64 ",\"dur\":%" PRIu64 ",\"cname\":\"%s\",\"args\":{\"id\":%" PRIu64 "}},\n",
13363+
m_ProcessId,
13364+
name.c_str(),
13365+
name.c_str(),
13366+
suffixes[state].c_str(),
13367+
usStarts[state],
13368+
usDeltas[state],
13369+
colours[state].c_str(),
13370+
enqueueCounter );
13371+
m_InterceptTrace.write(m_StringBuffer, size);
1333013372
}
1333113373
else
1333213374
{
13333-
m_InterceptTrace
13334-
<< "{\"name\":\"" << name << " " << suffixes[state]
13335-
<< "\", \"ph\":\"X\", \"pid\":" << processId
13336-
<< ", \"tid\":" << m_EventsChromeTraced << "." << queueNumber
13337-
<< ", \"ts\":" << usStarts[state]
13338-
<< ", \"dur\":" << usDeltas[state]
13339-
<< ", \"cname\":\"" << colours[state]
13340-
<< "\", \"args\":{\"id\":" << enqueueCounter
13341-
<< "}},\n";
13375+
int size = CLI_SPRINTF(m_StringBuffer, CLI_STRING_BUFFER_SIZE,
13376+
"{\"ph\":\"X\",\"pid\":%" PRIu64 ",\"tid\":%u.%u,\"name\":\"%s %s\""
13377+
",\"ts\":%" PRIu64 ",\"dur\":%" PRIu64 ",\"cname\":\"%s\",\"args\":{\"id\":%" PRIu64 "}},\n",
13378+
m_ProcessId,
13379+
m_EventsChromeTraced,
13380+
queueNumber,
13381+
name.c_str(),
13382+
suffixes[state].c_str(),
13383+
usStarts[state],
13384+
usDeltas[state],
13385+
colours[state].c_str(),
13386+
enqueueCounter );
13387+
m_InterceptTrace.write(m_StringBuffer, size);
1334213388
}
1334313389
}
1334413390
m_EventsChromeTraced++;
@@ -13350,25 +13396,29 @@ void CLIntercept::chromeTraceEvent(
1335013396
const uint64_t usDelta = ( commandEnd - commandStart ) / 1000;
1335113397
if( m_Config.ChromePerformanceTimingPerKernel )
1335213398
{
13353-
m_InterceptTrace
13354-
<< "{\"ph\":\"X\", \"pid\":" << processId
13355-
<< ", \"tid\":\"" << name
13356-
<< "\", \"name\":\"" << name
13357-
<< "\", \"ts\":" << usStart
13358-
<< ", \"dur\":" << usDelta
13359-
<< ", \"args\":{\"id\":" << enqueueCounter
13360-
<< "}},\n";
13399+
int size = CLI_SPRINTF(m_StringBuffer, CLI_STRING_BUFFER_SIZE,
13400+
"{\"ph\":\"X\",\"pid\":%" PRIu64 ",\"tid\":\"%s\",\"name\":\"%s\""
13401+
",\"ts\":%" PRIu64 ",\"dur\":%" PRIu64 ",\"args\":{\"id\":%" PRIu64 "}},\n",
13402+
m_ProcessId,
13403+
name.c_str(),
13404+
name.c_str(),
13405+
usStart,
13406+
usDelta,
13407+
enqueueCounter );
13408+
m_InterceptTrace.write(m_StringBuffer, size);
1336113409
}
1336213410
else
1336313411
{
13364-
m_InterceptTrace
13365-
<< "{\"ph\":\"X\", \"pid\":" << processId
13366-
<< ", \"tid\":-" << queueNumber
13367-
<< ", \"name\":\"" << name
13368-
<< "\", \"ts\":" << usStart
13369-
<< ", \"dur\":" << usDelta
13370-
<< ", \"args\":{\"id\":" << enqueueCounter
13371-
<< "}},\n";
13412+
int size = CLI_SPRINTF(m_StringBuffer, CLI_STRING_BUFFER_SIZE,
13413+
"{\"ph\":\"X\",\"pid\":%" PRIu64 ",\"tid\":-%u,\"name\":\"%s\""
13414+
",\"ts\":%" PRIu64 ",\"dur\":%" PRIu64 ",\"args\":{\"id\":%" PRIu64 "}},\n",
13415+
m_ProcessId,
13416+
queueNumber,
13417+
name.c_str(),
13418+
usStart,
13419+
usDelta,
13420+
enqueueCounter );
13421+
m_InterceptTrace.write(m_StringBuffer, size);
1337213422
}
1337313423
}
1337413424
}

0 commit comments

Comments
 (0)