Skip to content

Commit e2c3372

Browse files
committed
Add multi-GPU track support
Add per-GPU grouping in the UI when a trace contains multiple GPUs. Previously, all GPU tracks (render stages, counters, frequency, etc.) were merged into a single flat "GPU" group regardless of how many physical GPUs existed. When multiple GPUs exist, separate top-level groups "GPU (1)", "GPU (2)", etc. are created. GPUs are enumerated sequentially rather than using the raw gpu_id from the trace (which may be a hash). Single-GPU traces look identical to before.

Changes:
- Add gpu dimension to render stage track blueprint so gpu_id is stored as a track dimension (matching GPU counters).
- Add getOrCreateGpuGroup() to StandardGroupsPlugin to create top-level per-GPU groups.
- Extract gpu_id from dimension_arg_set_id in TraceProcessorTrackPlugin queries and route GPU tracks to per-GPU groups.
- Update GpuFreq plugin to use per-GPU grouping.

Bug: #5097
1 parent bf1d72c commit e2c3372

File tree

5 files changed

+120
-23
lines changed

5 files changed

+120
-23
lines changed

src/trace_processor/importers/proto/gpu_event_parser.cc

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ using protos::pbzero::VulkanMemoryEvent;
115115
constexpr auto kRenderStageBlueprint = TrackCompressor::SliceBlueprint(
116116
"gpu_render_stage",
117117
tracks::DimensionBlueprints(
118+
tracks::kGpuDimensionBlueprint,
118119
tracks::StringDimensionBlueprint("render_stage_source"),
119120
tracks::UintDimensionBlueprint("hwqueue_id"),
120121
tracks::StringIdDimensionBlueprint("hwqueue_name")),
@@ -283,6 +284,7 @@ StringId GpuEventParser::GetFullStageName(
283284
}
284285

285286
void GpuEventParser::InsertTrackForUninternedRenderStage(
287+
uint32_t gpu_id,
286288
uint32_t hw_queue_id,
287289
const GpuRenderStageEvent::Specifications::Description::Decoder& hw_queue) {
288290
if (!hw_queue.has_name()) {
@@ -314,7 +316,7 @@ void GpuEventParser::InsertTrackForUninternedRenderStage(
314316

315317
auto factory = context_->track_compressor->CreateTrackFactory(
316318
kRenderStageBlueprint,
317-
tracks::Dimensions("id", hw_queue_id, kNullStringId),
319+
tracks::Dimensions(gpu_id, "id", hw_queue_id, kNullStringId),
318320
tracks::DynamicName(name),
319321
[&, this](ArgsTracker::BoundInserter& inserter) {
320322
inserter.AddArg(description_id_, Variadic::String(description));
@@ -379,12 +381,14 @@ void GpuEventParser::ParseGpuRenderStageEvent(
379381
GpuRenderStageEvent::Decoder event(blob);
380382

381383
int32_t pid = 0;
384+
uint32_t gpu_id =
385+
event.has_gpu_id() ? static_cast<uint32_t>(event.gpu_id()) : 0;
382386
if (event.has_specifications()) {
383387
GpuRenderStageEvent::Specifications::Decoder spec(event.specifications());
384388
uint32_t hw_queue_id = 0;
385389
for (auto it = spec.hw_queue(); it; ++it) {
386390
GpuRenderStageEvent::Specifications::Description::Decoder hw_queue(*it);
387-
InsertTrackForUninternedRenderStage(hw_queue_id++, hw_queue);
391+
InsertTrackForUninternedRenderStage(gpu_id, hw_queue_id++, hw_queue);
388392
}
389393
for (auto it = spec.stage(); it; ++it) {
390394
GpuRenderStageEvent::Specifications::Description::Decoder stage(*it);
@@ -477,7 +481,7 @@ void GpuEventParser::ParseGpuRenderStageEvent(
477481
: kNullStringId;
478482
TrackId track_id = context_->track_compressor->InternScoped(
479483
kRenderStageBlueprint,
480-
tracks::Dimensions(base::StringView(source),
484+
tracks::Dimensions(gpu_id, base::StringView(source),
481485
static_cast<uint32_t>(hw_queue_id), dimension_name),
482486
ts, static_cast<int64_t>(event.duration()),
483487
tracks::DynamicName(track_name),

src/trace_processor/importers/proto/gpu_event_parser.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ class GpuEventParser {
8484
PacketSequenceStateGeneration* sequence_state,
8585
const protos::pbzero::GpuRenderStageEvent_Decoder& event) const;
8686
void InsertTrackForUninternedRenderStage(
87+
uint32_t gpu_id,
8788
uint32_t id,
8889
const protos::pbzero::GpuRenderStageEvent::Specifications::Description::
8990
Decoder&);

ui/src/plugins/dev.perfetto.GpuFreq/index.ts

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -44,25 +44,29 @@ export default class implements PerfettoPlugin {
4444

4545
if (tracks.length === 0) return;
4646

47-
const gpuGroup = ctx.plugins
48-
.getPlugin(StandardGroupsPlugin)
49-
.getOrCreateStandardGroup(ctx.defaultWorkspace, 'GPU');
47+
const standardGroupsPlugin = ctx.plugins.getPlugin(StandardGroupsPlugin);
5048

51-
// Only create a sub-group if there's more than one track.
52-
let parent: TrackNode;
53-
if (tracks.length > 1) {
54-
parent = new TrackNode({
55-
name: 'GPU Frequency',
56-
isSummary: true,
57-
});
58-
gpuGroup.addChildInOrder(parent);
59-
} else {
60-
parent = gpuGroup;
49+
// Query distinct GPU IDs across all GPU tracks and build a mapping from
50+
// raw gpu_id to a 1-based enumerated index.
51+
const gpuIdsResult = await ctx.engine.query(`
52+
select distinct extract_arg(dimension_arg_set_id, 'gpu') as gpu_id
53+
from track
54+
where extract_arg(dimension_arg_set_id, 'gpu') is not null
55+
order by gpu_id
56+
`);
57+
const gpuIdToIndex = new Map<number, number>();
58+
const gpuIt = gpuIdsResult.iter({gpu_id: NUM});
59+
let idx = 1;
60+
for (; gpuIt.valid(); gpuIt.next()) {
61+
gpuIdToIndex.set(gpuIt.gpu_id, idx++);
6162
}
63+
const gpuCount = gpuIdToIndex.size;
6264

6365
for (const {id, gpuId, unit} of tracks) {
66+
const gpuIndex = gpuIdToIndex.get(gpuId) ?? 1;
6467
const uri = `/gpu_frequency_${gpuId}`;
65-
const name = `Gpu ${gpuId} Frequency`;
68+
const name =
69+
gpuCount > 1 ? `GPU (${gpuIndex}) Frequency` : 'GPU Frequency';
6670
ctx.tracks.registerTrack({
6771
uri,
6872
tags: {
@@ -77,8 +81,16 @@ export default class implements PerfettoPlugin {
7781
name,
7882
),
7983
});
84+
85+
// Determine the parent group for this frequency track.
86+
const gpuGroup = standardGroupsPlugin.getOrCreateGpuGroup(
87+
ctx.defaultWorkspace,
88+
gpuIndex,
89+
gpuCount,
90+
);
91+
8092
const track = new TrackNode({uri, name, sortOrder: -20});
81-
parent.addChildInOrder(track);
93+
gpuGroup.addChildInOrder(track);
8294
}
8395
}
8496
}

ui/src/plugins/dev.perfetto.StandardGroups/index.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ export default class implements PerfettoPlugin {
5050
HYPERVISOR: makeGroupNode('Hypervisor'),
5151
};
5252

53+
private readonly gpuSubGroups = new Map<number, TrackNode>();
54+
5355
async onTraceLoad() {}
5456

5557
/**
@@ -70,6 +72,37 @@ export default class implements PerfettoPlugin {
7072

7173
return node;
7274
}
75+
76+
/**
77+
* Gets or creates a per-GPU group for multi-GPU traces.
78+
*
79+
* When gpuCount <= 1, returns the standard "GPU" group (no change).
80+
* When gpuCount > 1, creates/returns a top-level "GPU (N)" group where N
81+
* is a 1-based enumerated index (not the raw gpu_id from the trace).
82+
*
* Per-GPU groups are cached in `gpuSubGroups` keyed by gpuIndex only, so a
* repeated call with the same index returns the node created by the first
* call, whichever workspace that call was given.
* NOTE(review): presumably every caller passes the same workspace - confirm.
*
83+
* @param workspace - The workspace on which to create the group.
84+
* @param gpuIndex - 1-based enumerated GPU index.
85+
* @param gpuCount - Total number of distinct GPUs in the trace.
* @returns The standard "GPU" group when gpuCount <= 1, otherwise the cached
* or newly created "GPU (N)" group.
86+
*/
87+
getOrCreateGpuGroup(
88+
workspace: Workspace,
89+
gpuIndex: number,
90+
gpuCount: number,
91+
): TrackNode {
// Zero or one GPU: hand back the shared standard "GPU" group; gpuIndex is
// not consulted on this path.
92+
if (gpuCount <= 1) {
93+
return this.getOrCreateStandardGroup(workspace, 'GPU');
94+
}
95+
// Multi-GPU: reuse the group already created for this index, if any. The
// lookup does not take `workspace` into account.
96+
const existing = this.gpuSubGroups.get(gpuIndex);
97+
if (existing) {
98+
return existing;
99+
}
100+
// First request for this index: build the "GPU (N)" node, attach it directly
// to the workspace, and remember it for later calls.
101+
const group = makeGroupNode(`GPU (${gpuIndex})`);
102+
workspace.addChildInOrder(group);
103+
this.gpuSubGroups.set(gpuIndex, group);
104+
return group;
105+
}
73106
}
74107

75108
function makeGroupNode(name: string, collapsed = true) {

ui/src/plugins/dev.perfetto.TraceProcessorTrack/index.ts

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ export default class TraceProcessorTrackPlugin implements PerfettoPlugin {
9292

9393
private groups = new Map<string, TrackNode>();
9494
private store?: Store<TraceProcessorTrackPluginState>;
95+
private gpuCount = 0;
96+
private gpuIdToIndex = new Map<number, number>();
9597

9698
private migrateTraceProcessorTrackPluginState(
9799
init: unknown,
@@ -104,6 +106,23 @@ export default class TraceProcessorTrackPlugin implements PerfettoPlugin {
104106
this.store = ctx.mountStore(TraceProcessorTrackPlugin.id, (init) =>
105107
this.migrateTraceProcessorTrackPluginState(init),
106108
);
109+
110+
// Query distinct GPU IDs in the trace and build a mapping from raw gpu_id
111+
// to a 1-based enumerated index. The raw gpu_id may be a hash or arbitrary
112+
// value, so we enumerate sequentially for display purposes.
113+
const gpuIdsResult = await ctx.engine.query(`
114+
select distinct extract_arg(dimension_arg_set_id, 'gpu') as gpu_id
115+
from track
116+
where extract_arg(dimension_arg_set_id, 'gpu') is not null
117+
order by gpu_id
118+
`);
119+
const gpuIt = gpuIdsResult.iter({gpu_id: NUM});
120+
let gpuIndex = 1;
121+
for (; gpuIt.valid(); gpuIt.next()) {
122+
this.gpuIdToIndex.set(gpuIt.gpu_id, gpuIndex++);
123+
}
124+
this.gpuCount = this.gpuIdToIndex.size;
125+
107126
await this.addCounters(ctx);
108127
await this.addSlices(ctx);
109128
this.addAggregations(ctx);
@@ -124,6 +143,7 @@ export default class TraceProcessorTrackPlugin implements PerfettoPlugin {
124143
ct.machine_id as machine,
125144
extract_arg(ct.dimension_arg_set_id, 'utid') as utid,
126145
extract_arg(ct.dimension_arg_set_id, 'upid') as upid,
146+
extract_arg(ct.dimension_arg_set_id, 'gpu') as gpu_id,
127147
extract_arg(ct.source_arg_set_id, 'description') as description
128148
from counter_track ct
129149
join _counter_track_summary using (id)
@@ -153,6 +173,7 @@ export default class TraceProcessorTrackPlugin implements PerfettoPlugin {
153173
unit: STR_NULL,
154174
utid: NUM_NULL,
155175
upid: NUM_NULL,
176+
gpu_id: NUM_NULL,
156177
threadName: STR_NULL,
157178
processName: STR_NULL,
158179
tid: LONG_NULL,
@@ -170,6 +191,7 @@ export default class TraceProcessorTrackPlugin implements PerfettoPlugin {
170191
unit,
171192
utid,
172193
upid,
194+
gpu_id: gpuId,
173195
threadName,
174196
processName,
175197
tid,
@@ -232,6 +254,7 @@ export default class TraceProcessorTrackPlugin implements PerfettoPlugin {
232254
group,
233255
upid,
234256
utid,
257+
gpuId,
235258
new TrackNode({
236259
uri,
237260
name: trackName,
@@ -264,6 +287,7 @@ export default class TraceProcessorTrackPlugin implements PerfettoPlugin {
264287
lower(min(t.name)) as lower_name,
265288
extract_arg(t.dimension_arg_set_id, 'utid') as utid,
266289
extract_arg(t.dimension_arg_set_id, 'upid') as upid,
290+
extract_arg(t.dimension_arg_set_id, 'gpu') as gpu_id,
267291
extract_arg(t.source_arg_set_id, 'description') as description,
268292
min(t.id) minTrackId,
269293
group_concat(t.id) as trackIds,
@@ -277,13 +301,14 @@ export default class TraceProcessorTrackPlugin implements PerfettoPlugin {
277301
from _slice_track_summary s
278302
join track t using (id)
279303
left join _track_event_tracks_with_callstacks cs on cs.track_id = t.id
280-
group by type, upid, utid, t.track_group_id, ifnull(t.track_group_id, t.id)
304+
group by type, upid, utid, gpu_id, t.track_group_id, ifnull(t.track_group_id, t.id)
281305
)
282306
select
283307
s.type,
284308
s.name,
285309
s.utid,
286310
ifnull(s.upid, tp.upid) as upid,
311+
s.gpu_id,
287312
s.minTrackId as minTrackId,
288313
s.trackIds as trackIds,
289314
s.trackCount,
@@ -330,6 +355,7 @@ export default class TraceProcessorTrackPlugin implements PerfettoPlugin {
330355
name: STR_NULL,
331356
utid: NUM_NULL,
332357
upid: NUM_NULL,
358+
gpu_id: NUM_NULL,
333359
trackIds: STR,
334360
maxDepth: NUM,
335361
tid: LONG_NULL,
@@ -351,6 +377,7 @@ export default class TraceProcessorTrackPlugin implements PerfettoPlugin {
351377
maxDepth,
352378
utid,
353379
upid,
380+
gpu_id: gpuId,
354381
threadName,
355382
processName,
356383
tid,
@@ -417,6 +444,7 @@ export default class TraceProcessorTrackPlugin implements PerfettoPlugin {
417444
group,
418445
upid,
419446
utid,
447+
gpuId,
420448

421449
new TrackNode({
422450
uri,
@@ -436,6 +464,7 @@ export default class TraceProcessorTrackPlugin implements PerfettoPlugin {
436464
group: string | TrackGroupSchema | undefined,
437465
upid: number | null,
438466
utid: number | null,
467+
gpuId: number | null,
439468
track: TrackNode,
440469
) {
441470
switch (topLevelGroup) {
@@ -466,10 +495,28 @@ export default class TraceProcessorTrackPlugin implements PerfettoPlugin {
466495
break;
467496
}
468497
default: {
469-
const standardGroup = ctx.plugins
470-
.getPlugin(StandardGroupsPlugin)
471-
.getOrCreateStandardGroup(ctx.defaultWorkspace, topLevelGroup);
472-
this.getGroupByName(standardGroup, group, null).addChildInOrder(track);
498+
const standardGroupsPlugin =
499+
ctx.plugins.getPlugin(StandardGroupsPlugin);
500+
501+
// For GPU tracks with a gpu_id, use per-GPU groups when multiple
502+
// GPUs exist in the trace.
503+
if (topLevelGroup === 'GPU' && gpuId !== null) {
504+
const gpuIndex = this.gpuIdToIndex.get(gpuId) ?? 1;
505+
const gpuGroup = standardGroupsPlugin.getOrCreateGpuGroup(
506+
ctx.defaultWorkspace,
507+
gpuIndex,
508+
this.gpuCount,
509+
);
510+
this.getGroupByName(gpuGroup, group, gpuId).addChildInOrder(track);
511+
} else {
512+
const standardGroup = standardGroupsPlugin.getOrCreateStandardGroup(
513+
ctx.defaultWorkspace,
514+
topLevelGroup,
515+
);
516+
this.getGroupByName(standardGroup, group, null).addChildInOrder(
517+
track,
518+
);
519+
}
473520
break;
474521
}
475522
}

0 commit comments

Comments
 (0)