Skip to content

Commit 7445d3c

Browse files
committed
feat: Add Perfetto tracing backend with OpenCL(TM) GPU timespan support
- Perfetto source file added under (third_party/perfetto/perfetto.cc/.h) - Introduce acl_profile.cpp file to wrap Perfetto APIs and provide static storage. - Add CL command tracing to reconstruct GPU timespans in Perfetto alongside CPU. - Update SConstruct, SConscript, and filelist.json to include new profiling sources Resolves: COMPMID-8337 Signed-off-by: Walid Ben Romdhane <[email protected]> Change-Id: Ibb319670c787675df617ce91fe0c54d4d34d0d26 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/14843 Reviewed-by: Pablo Marquez Tello <[email protected]> Benchmark: Arm Jenkins <[email protected]> Comments-Addressed: Arm Jenkins <[email protected]> Tested-by: Arm Jenkins <[email protected]>
1 parent ab70aa8 commit 7445d3c

File tree

9 files changed

+242764
-0
lines changed

9 files changed

+242764
-0
lines changed

Android.bp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ cc_library_static {
202202
"src/common/cpuinfo/CpuIsaInfo.cpp",
203203
"src/common/cpuinfo/CpuModel.cpp",
204204
"src/common/utils/LegacySupport.cpp",
205+
"src/common/utils/profile/acl_profile.cpp",
205206
"src/core/AccessWindowAutoPadding.cpp",
206207
"src/core/AccessWindowStatic.cpp",
207208
"src/core/AccessWindowTranspose.cpp",

SConscript

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,14 @@ if env['fixed_format_kernels']:
591591
if env["logging"]:
592592
lib_files += filelist['logging']
593593

594+
# Profiling files
595+
if env["profile"]:
596+
lib_files+= filelist['profiling']
597+
if env['opencl']:
598+
lib_files += Glob(os.path.join(Dir('#').path,
599+
'tests', 'framework', 'instruments',
600+
'OpenCLTimer.cpp'))
601+
594602
# C API files
595603
lib_files += filelist['c_api']['common']
596604
lib_files += filelist['c_api']['operators']

SConstruct

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,9 @@ vars.AddVariables(
9494
BoolVariable("debug", "Debug", False),
9595
BoolVariable("profile", "Profile using Perfetto, forces C++17", False),
9696
EnumVariable("profile_level", "Profile level. If not set, defaults to 0", allowed_values=("0", "1", "2"), default="0"),
97+
# allowed_values must be a sequence of names: ("perfetto") is just the string "perfetto",
# the trailing comma is what makes it a one-element tuple.
EnumVariable("profile_backend", "Profile backend. If not set, defaults to 'perfetto'", allowed_values=("perfetto",), default="perfetto"),
98+
EnumVariable("profile_size", "Profile size in KB. If not set, defaults to 16384", allowed_values=("16384", "32768", "65536", "131072"), default="16384"),
99+
EnumVariable("profile_mode", "Profile mode. If not set, defaults to 'kInProcessBackend'", allowed_values=("kInProcessBackend", "kSystemBackend"), default="kInProcessBackend"),
97100
BoolVariable("asserts", "Enable asserts (this flag is forced to 1 for debug=1)", False),
98101
BoolVariable("logging", "Enable Logging", False),
99102
EnumVariable("arch", "Target Architecture. The x86_32 and x86_64 targets can only be used with neon=0 and opencl=1.", "armv7a",
@@ -255,6 +258,27 @@ if not 'windows' in env['os']:
255258
if env['profile']:
256259
env.Append(CXXFLAGS = ['-std=c++17', '-DACL_PROFILE_ENABLE'])
257260
env.Append(CXXFLAGS = ['-DACL_PROFILE_LEVEL=%d' % int(env['profile_level'])])
261+
env.Append(CXXFLAGS = ['-DACL_PROFILE_BACKEND=%s' % env['profile_backend']])
262+
env.Append(CXXFLAGS = ['-DACL_PROFILE_MODE=%s' % env['profile_mode']])
263+
env.Append(CXXFLAGS = ['-DACL_ACL_PROFILE_SIZE_KB=%d' % int(env['profile_size'])])
264+
if env['profile_backend'] == 'perfetto':
265+
env.Append(CXXFLAGS = [
266+
'-std=c++17',
267+
'-Wno-switch-default',
268+
'-Wno-effc++',
269+
'-Wno-strict-overflow',
270+
'-Wno-noexcept',
271+
'-Wno-error=noexcept',
272+
'-Wno-error=strict-aliasing',
273+
'-Wno-error=class-memaccess',
274+
'-Wno-error=maybe-uninitialized',
275+
'-Wno-format-nonliteral',
276+
'-Wno-error=redundant-move',
277+
'-Wno-error=logical-op'])
278+
if env['opencl']:
279+
env.Append(CXXFLAGS = ['-DARM_COMPUTE_CL'])
280+
if env['os'] == 'android':
281+
env.Append(LINKFLAGS = ['-llog'])
258282

259283
cpp_tool = {'linux': 'g++', 'android' : 'clang++',
260284
'tizen': 'g++', 'macos':'clang++',

filelist.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,10 @@
7979
"src/runtime/CPP/functions/CPPTopKV.cpp",
8080
"src/runtime/CPP/functions/CPPUpsample.cpp"
8181
],
82+
"profiling": [
83+
"third_party/perfetto/perfetto.cc",
84+
"src/common/utils/profile/acl_profile.cpp"
85+
],
8286
"logging": [
8387
"src/core/utils/logging/FilePrinter.cpp",
8488
"src/core/utils/logging/Helpers.cpp",
Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
/*
2+
* Copyright (c) 2025 Arm Limited.
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to
8+
* deal in the Software without restriction, including without limitation the
9+
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10+
* sell copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in all
14+
* copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
*/
24+
#if defined(ACL_PROFILE_ENABLE) && (ACL_PROFILE_BACKEND == PERFETTO)
25+
#include "src/common/utils/profile/acl_profile.h"
26+
27+
#include <fstream>
28+
29+
PERFETTO_TRACK_EVENT_STATIC_STORAGE();
30+
namespace arm_compute
31+
{
32+
namespace profile
33+
{
34+
35+
/** Initialise Perfetto and immediately start the session that records track events.
 *
 * The tracing backend is selected at build time through ACL_PROFILE_MODE and the trace
 * buffer size (in KB) through ACL_ACL_PROFILE_SIZE_KB. The session is started with
 * StartBlocking(), so the constructor returns only once that call has completed.
 */
PerfettoProfiler::PerfettoProfiler()
    : tracing_session(nullptr),
      trace_start_ns(perfetto::TrackEvent::GetTraceTimeNs())
#ifdef ARM_COMPUTE_CL
      ,
      opencl_clock(nullptr),
      opencl_tracing_enabled(false)
#endif
{
    // Global Perfetto set-up: pick the backend, then register the track-event data source.
    perfetto::TracingInitArgs init_args;
    init_args.backends = perfetto::ACL_PROFILE_MODE;
    perfetto::Tracing::Initialize(init_args);
    perfetto::TrackEvent::Register();

    // Session configuration: one buffer and the "track_event" data source.
    perfetto::TraceConfig trace_config;
    auto                 *buffer_config = trace_config.add_buffers();
    buffer_config->set_size_kb(ACL_ACL_PROFILE_SIZE_KB);
    auto *source_config = trace_config.add_data_sources()->mutable_config();
    source_config->set_name("track_event");

    tracing_session = perfetto::Tracing::NewTrace();
    tracing_session->Setup(trace_config);
    tracing_session->StartBlocking();
}
56+
57+
/** Stop the tracing session and dump the recorded trace to "acl.pftrace".
 *
 * Pending track events are flushed before the session is stopped. The file is created
 * relative to the current working directory. A failure to open or write the file is
 * reported on std::cerr instead of being silently ignored; nothing is thrown from here.
 */
PerfettoProfiler::~PerfettoProfiler()
{
    if (!tracing_session)
    {
        return;
    }

    perfetto::TrackEvent::Flush();
    tracing_session->StopBlocking();
    const auto data = tracing_session->ReadTraceBlocking();

    std::ofstream out("acl.pftrace", std::ios::binary);
    if (!out)
    {
        std::cerr << "Failed to open acl.pftrace for writing; the trace is discarded." << std::endl;
        return;
    }

    out.write(data.data(), static_cast<std::streamsize>(data.size()));
    out.close();
    // close() flushes the stream and sets failbit when the flush fails, so one check covers both.
    if (!out)
    {
        std::cerr << "Failed to write the trace to acl.pftrace." << std::endl;
    }
}
69+
70+
#ifdef ARM_COMPUTE_CL
71+
void PerfettoProfiler::openclTraceBegin()
72+
{
73+
// Lock the process to ensure that the tracing session is created only once
74+
if (!opencl_tracing_enabled && ACL_PROFILE_LEVEL > 0)
75+
{
76+
#ifdef ARM_COMPUTE_CL
77+
opencl_clock = std::make_unique<OpenCLClock<true>>(ScaleFactor::NONE);
78+
if (!opencl_clock)
79+
{
80+
std::cerr << "Failed to create OpenCLClock instance." << std::endl;
81+
}
82+
#endif
83+
opencl_clock->test_start();
84+
opencl_clock->start();
85+
opencl_tracing_enabled = true;
86+
}
87+
}
88+
89+
void PerfettoProfiler::openclTraceEnd()
90+
{
91+
uint64_t cpu_sync_time = getTsNs();
92+
ARM_COMPUTE_TRACE_CUSTOM_EVENT_END(ARM_COMPUTE_PROF_CAT_SCHEDULER, 0, cpu_sync_time);
93+
if (!opencl_clock || !opencl_tracing_enabled || ACL_PROFILE_LEVEL < 1)
94+
{
95+
return;
96+
}
97+
opencl_clock->stop();
98+
opencl_clock->test_stop();
99+
100+
#if (ACL_PROFILE_LEVEL > 1)
101+
// Print the RAW GPU timestamps
102+
std::cout << "RAW GPU timestamps:" << std::endl;
103+
for (const auto &instrument : opencl_clock->measurements())
104+
{
105+
std::cout << instrument.first << ": " << instrument.second << std::endl;
106+
}
107+
#endif
108+
109+
// The difference between the instrument map and this map is that.
110+
// MeasurementsMap elements does have an awareness of the timestamps in other GPU stages.
111+
// Gathering all the timestamps stages in the value of the map makes drawing spans in the CPU timeline easier.
112+
// |-------------------+----------------------------------------------------------------|
113+
// | Map Name | Key | Value |
114+
// |-------------------+--------------------------+-------------------------------------|
115+
// | MeasurementsMap | [stage][kernel]#ID | GPU timestamp as string |
116+
// | | (e.g., "[start]foo#1" ) | (e.g., "123456789 ns") |
117+
// |-------------------+--------------------------+-------------------------------------|
118+
// | gpu_spans_map | [kernel]#ID | vector of stage timestamps (CPU ns) |
119+
// | | (e.g., "foo#1" ) | [queued, flushed, start, end] |
120+
// |-------------------+--------------------------+-------------------------------------|
121+
std::map<std::string, std::vector<uint64_t>> gpu_spans_map;
122+
123+
uint64_t gpu_sync_time = 0;
124+
// TODO : find a better way to sync GPU and CPU times
125+
// Here we are finding the GPU timestamp that have the highest value.
126+
// This is the closest to the end of ::sync() call.
127+
for (const auto &instrument : opencl_clock->measurements())
128+
{
129+
uint64_t gpu_ts = std::stoull(instrument.second.value().to_string());
130+
if (gpu_ts > gpu_sync_time)
131+
{
132+
gpu_sync_time = gpu_ts;
133+
}
134+
}
135+
136+
for (const auto &instrument : opencl_clock->measurements())
137+
{
138+
const std::string &key = instrument.first;
139+
const std::string time_str = instrument.second.value().to_string();
140+
uint64_t gpu_time = std::stoull(time_str);
141+
uint64_t cpu_time = gpu_time + cpu_sync_time - gpu_sync_time;
142+
143+
if (key.empty() || key[0] != '[')
144+
continue;
145+
146+
// Find the closing bracket
147+
size_t end_bracket = key.find(']');
148+
if (end_bracket == std::string::npos)
149+
continue;
150+
151+
std::string stage = key.substr(1, end_bracket - 1);
152+
std::string kernel = key.substr(end_bracket + 1);
153+
154+
int index = -1;
155+
if (stage == "queued")
156+
index = 0;
157+
else if (stage == "flushed")
158+
index = 1;
159+
else if (stage == "start")
160+
index = 2;
161+
else if (stage == "end")
162+
index = 3;
163+
else
164+
continue;
165+
166+
auto &vec = gpu_spans_map[kernel];
167+
if (vec.size() < 4)
168+
vec.resize(4, 0);
169+
vec[index] = cpu_time;
170+
}
171+
172+
for (auto &instrument : gpu_spans_map)
173+
{
174+
#if (ACL_PROFILE_LEVEL > 1)
175+
std::cout << "Kernel: " << instrument.first << std::endl;
176+
std::cout << "Queued: " << instrument.second[0] << " ns" << std::endl;
177+
std::cout << "Flushed: " << instrument.second[1] << " ns" << std::endl;
178+
std::cout << "Start: " << instrument.second[2] << " ns" << std::endl;
179+
std::cout << "End: " << instrument.second[3] << " ns" << std::endl;
180+
std::cout << std::endl;
181+
#endif
182+
183+
ARM_COMPUTE_TRACE_CUSTOM_EVENT(ARM_COMPUTE_PROF_CAT_GPU, ARM_COMPUTE_PROF_LVL_GPU, instrument.second[0],
184+
instrument.second[1] - instrument.second[0], "GPU::Queue",
185+
instrument.first.c_str());
186+
ARM_COMPUTE_TRACE_CUSTOM_EVENT(ARM_COMPUTE_PROF_CAT_GPU, ARM_COMPUTE_PROF_LVL_GPU, instrument.second[1],
187+
instrument.second[2] - instrument.second[1], "GPU::Flush",
188+
instrument.first.c_str());
189+
ARM_COMPUTE_TRACE_CUSTOM_EVENT(ARM_COMPUTE_PROF_CAT_GPU, ARM_COMPUTE_PROF_LVL_GPU, instrument.second[2],
190+
instrument.second[3] - instrument.second[2], "GPU::Run",
191+
instrument.first.c_str());
192+
}
193+
opencl_clock.reset();
194+
opencl_tracing_enabled = false;
195+
}
196+
#endif
197+
uint64_t PerfettoProfiler::getTsNs() const
198+
{
199+
return perfetto::TrackEvent::GetTraceTimeNs() - trace_start_ns;
200+
}
201+
202+
namespace
{
// The single profiler instance of this translation unit (internal linkage).
// It is a namespace-scope object, so it is constructed during static initialisation
// and destroyed at program exit.
PerfettoProfiler acl_perfetto;
} // namespace

/** Access the profiler instance defined in this file.
 *
 * @return Reference to the file-local PerfettoProfiler object.
 */
PerfettoProfiler &get_profiler()
{
    PerfettoProfiler &instance = acl_perfetto;
    return instance;
}
208+
209+
} // namespace profile
210+
} // namespace arm_compute
211+
212+
#endif

0 commit comments

Comments
 (0)