Skip to content

Commit 69d8c01

Browse files
authored
Util UI for allocated memory (#6878)
UI fix to include the GPU memory allocation and GPU memory bandwidth metrics. Signed-off-by: Yang Wang <[email protected]>
1 parent f818687 commit 69d8c01

File tree

8 files changed

+94
-27
lines changed

8 files changed

+94
-27
lines changed

torchci/clickhouse_queries/oss_ci_util/oss_ci_list_util_stats/query.sql

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,18 @@ WITH aggregate_data AS (
55
max(JSONExtractFloat(json_data, 'cpu','max')) as cpu_max,
66
max(JSONExtractFloat(json_data, 'memory','max')) as memory_max,
77
max(arrayMax(arrayMap(x->JSONExtractFloat(x,'util_percent','max'),JSONExtractArrayRaw(json_data,'gpu_usage')))) as gpu_max,
8-
max(arrayMax(arrayMap(x->JSONExtractFloat(x,'mem_util_percent','max'),JSONExtractArrayRaw(json_data,'gpu_usage')))) as gpu_mem_max,
8+
max(arrayMax(arrayMap(x->JSONExtractFloat(x,'mem_util_percent','max'),JSONExtractArrayRaw(json_data,'gpu_usage')))) as gpu_mem_bandwidth_max,
9+
max(arrayMax(arrayMap(x -> JSONExtractFloat(x, 'allocated_mem_percent', 'max'),JSONExtractArrayRaw(json_data, 'gpu_usage'))))as gpu_allocated_mem_max,
910
avg(JSONExtractFloat(json_data, 'cpu','max')) as cpu_avg,
1011
avg(JSONExtractFloat(json_data, 'memory','max')) as memory_avg,
1112
avg(arrayAvg(arrayMap(x->JSONExtractFloat(x,'util_percent','max'),JSONExtractArrayRaw(json_data,'gpu_usage')))) as gpu_avg,
12-
avg(arrayAvg(arrayMap(x->JSONExtractFloat(x,'mem_util_percent','max'),JSONExtractArrayRaw(json_data,'gpu_usage')))) as gpu_mem_avg,
13+
avg(arrayAvg(arrayMap(x->JSONExtractFloat(x,'mem_util_percent','max'),JSONExtractArrayRaw(json_data,'gpu_usage')))) as gpu_mem_bandwidth_avg,
14+
avg(arrayAvg(arrayFilter(x -> x IS NOT NULL,arrayMap(x->JSONExtractFloat(x,'allocated_mem_percent','max'),JSONExtractArrayRaw(json_data,'gpu_usage'))))) as gpu_allocated_mem_avg,
1315
quantile(0.9)(JSONExtractFloat(json_data, 'cpu','max')) AS cpu_p90,
1416
quantile(0.9)(JSONExtractFloat(json_data, 'memory','max')) AS memory_p90,
1517
quantile(0.9)(arrayMax(arrayMap(x->JSONExtractFloat(x,'util_percent','max'),JSONExtractArrayRaw(json_data,'gpu_usage')))) as gpu_p90,
16-
quantile(0.9)(arrayMax(arrayMap(x->JSONExtractFloat(x,'mem_util_percent','max'),JSONExtractArrayRaw(json_data,'gpu_usage')))) as gpu_mem_p90
18+
quantile(0.9)(arrayMax(arrayMap(x->JSONExtractFloat(x,'mem_util_percent','max'),JSONExtractArrayRaw(json_data,'gpu_usage')))) as gpu_mem_bandwidth_p90,
19+
quantile(0.9)(arrayMax(arrayFilter(x -> x IS NOT NULL,arrayMap(x->JSONExtractFloat(x,'allocated_mem_percent','max'),JSONExtractArrayRaw(json_data,'gpu_usage'))))) as gpu_allocated_mem_p90
1720
FROM
1821
misc.oss_ci_time_series
1922
WHERE
@@ -39,12 +42,15 @@ SELECT
3942
a.memory_avg,
4043
a.gpu_max,
4144
a.gpu_avg,
42-
a.gpu_mem_max,
43-
a.gpu_mem_avg,
45+
a.gpu_mem_bandwidth_max,
46+
a.gpu_mem_bandwidth_avg,
4447
a.cpu_p90,
4548
a.memory_p90,
46-
a.gpu_mem_p90,
47-
a.gpu_p90
49+
a.gpu_mem_bandwidth_p90,
50+
a.gpu_p90,
51+
a.gpu_allocated_mem_max,
52+
a.gpu_allocated_mem_avg,
53+
a.gpu_allocated_mem_p90
4854
FROM
4955
misc.oss_ci_utilization_metadata o
5056
JOIN aggregate_data a ON a.job_id = o.job_id AND a.run_attempt = o.run_attempt

torchci/components/utilization/JobUtilizationPage/JobUtilizationPage.tsx

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ export const lineFilters: PickerConfig[] = [
1818
category: "gpu",
1919
types: [
2020
{ name: "gpu util", tags: ["gpu", "|util_percent"] },
21-
{ name: "gpu mem", tags: ["gpu", "|mem_util_percent"] },
21+
{ name: "gpu mem bw", tags: ["gpu", "|mem_util_percent"] },
22+
{ name: "gpu allocated mem", tags: ["gpu", "|allocated_mem_percent"] },
2223
],
2324
},
2425
{ category: "cpu", types: [{ name: "cpu", tags: ["cpu"] }] },
@@ -45,6 +46,7 @@ export const JobUtilizationPage = ({
4546
// this makes sense for utilization to detect potential efficiency issues, later our ui
4647
// can support other aggregation methods for analysis, it's very disruptive to add both in UI right now.
4748
const aggregateType = "max";
49+
const skipLineTypes = ["_mem_value", "total_mem"];
4850

4951
useEffect(() => {
5052
if (!data) {
@@ -55,9 +57,15 @@ export const JobUtilizationPage = ({
5557
const lines = data.ts_list;
5658

5759
// currently we only show data that is aggregated by max value during the time interval
58-
const filteredLines = lines.filter((line) =>
59-
line.id.includes(aggregateType)
60-
);
60+
const filteredLines = lines.filter((line) => {
61+
const skiplineType = skipLineTypes.find((skipLineType) => {
62+
return line.id.includes(skipLineType);
63+
});
64+
if (skiplineType) {
65+
return false;
66+
}
67+
return line.id.includes(aggregateType);
68+
});
6169

6270
const jobStats: StatsInfo[] = processStatsData(filteredLines);
6371

torchci/components/utilization/JobUtilizationPage/helper.ts

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,12 @@ export function calculateAverage(data: number[]) {
257257
function getAllGpusStats(stats: StatsInfo[]) {
258258
// get all gpus stats for the test
259259
const gpuUtils = stats.filter((item) => item.id.includes("|util_percent"));
260-
const gpuMems = stats.filter((item) => item.id.includes("|mem_util_percent"));
260+
const gpuMemBWs = stats.filter((item) =>
261+
item.id.includes("|mem_util_percent")
262+
);
263+
const gpuAllocMems = stats.filter((item) =>
264+
item.id.includes("|allocated_mem")
265+
);
261266

262267
if (gpuUtils.length == 0) {
263268
return [];
@@ -284,18 +289,40 @@ function getAllGpusStats(stats: StatsInfo[]) {
284289
],
285290
},
286291
{
287-
name: "gpu_mem_all",
288-
id: "gpu_mem_all",
292+
name: "gpu_mem_bw_all",
293+
id: "gpu_mem_bw_all",
289294
columns: [
290-
aggregateStats(gpuMems, StatType.Average, AgggregateMethod.Average),
291-
aggregateStats(gpuMems, StatType.Max, AgggregateMethod.Max),
295+
aggregateStats(gpuMemBWs, StatType.Average, AgggregateMethod.Average),
296+
aggregateStats(gpuMemBWs, StatType.Max, AgggregateMethod.Max),
292297
aggregateStats(
293-
gpuMems,
298+
gpuMemBWs,
294299
StatType.SpikeFrequency,
295300
AgggregateMethod.Average
296301
),
297302
aggregateStats(
298-
gpuMems,
303+
gpuMemBWs,
304+
StatType.SpikeAvgInterval,
305+
AgggregateMethod.Max
306+
),
307+
],
308+
},
309+
{
310+
name: "gpu_allocated_mem_all",
311+
id: "gpu_allocated_mem_all",
312+
columns: [
313+
aggregateStats(
314+
gpuAllocMems,
315+
StatType.Average,
316+
AgggregateMethod.Average
317+
),
318+
aggregateStats(gpuAllocMems, StatType.Max, AgggregateMethod.Max),
319+
aggregateStats(
320+
gpuAllocMems,
321+
StatType.SpikeFrequency,
322+
AgggregateMethod.Average
323+
),
324+
aggregateStats(
325+
gpuAllocMems,
299326
StatType.SpikeAvgInterval,
300327
AgggregateMethod.Max
301328
),

torchci/components/utilization/WorkflowUtilizationPage/WorkflowUtilizationPage.tsx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ const WorkflowUtilization = () => {
4848
};
4949
});
5050

51+
console.log(rows);
52+
5153
const columns: any[] = [
5254
{ field: "name", headerName: "Job Name", width: 400 },
5355
{ field: "id", headerName: "Job id", width: 120 },

torchci/components/utilization/components/TestSectionView/RankTestView/RankTestView.tsx

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,12 @@ const DefaultGpuResourceValue = [
3535
value: "gpus_util_all",
3636
},
3737
{
38-
name: "all gpu memory",
39-
value: "gpu_mem_all",
38+
name: "all gpu memory bandwidth",
39+
value: "gpu_mem_bw_all",
40+
},
41+
{
42+
name: "all gpu allocated memory ",
43+
value: "gpu_allocated_mem_all",
4044
},
4145
];
4246

torchci/lib/utilization/fetchListUtilizationMetadataInfo.ts

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,24 @@ function toUtilizationStats(metadata: any) {
9595
gpu_max: metadata.gpu_count ? metadata.gpu_max : undefined,
9696
gpu_avg: metadata.gpu_count ? metadata.gpu_avg : undefined,
9797
gpu_p90: metadata.gpu_count ? metadata.gpu_p90 : undefined,
98-
gpu_memory_max: metadata.gpu_count ? metadata.gpu_mem_max : undefined,
99-
gpu_memory_avg: metadata.gpu_count ? metadata.gpu_mem_avg : undefined,
100-
gpu_memmory_p90: metadata.gpu_count ? metadata.gpu_mem_p90 : undefined,
98+
gpu_memory_bandwidth_max: metadata.gpu_count
99+
? metadata.gpu_mem_bandwidth_max
100+
: undefined,
101+
gpu_memory_bandwidth_avg: metadata.gpu_count
102+
? metadata.gpu_mem_bandwidth_avg
103+
: undefined,
104+
gpu_memory_bandwidth_p90: metadata.gpu_count
105+
? metadata.gpu_mem_bandwidth_p90
106+
: undefined,
107+
gpu_allocated_memory_max: metadata.gpu_count
108+
? metadata.gpu_allocated_mem_max
109+
: undefined,
110+
gpu_allocated_memory_avg: metadata.gpu_count
111+
? metadata.gpu_allocated_mem_avg
112+
: undefined,
113+
gpu_allocated_memory_p90: metadata.gpu_count
114+
? metadata.gpu_allocated_mem_p90
115+
: undefined,
101116
has_gpu: metadata.gpu_count ? metadata.gpu_count > 0 : false,
102117
};
103118
return stats;

torchci/lib/utilization/fetchUtilization.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,9 @@ function getDisplayName(name: string) {
9999
}
100100
if (tags[0].toLowerCase().includes("gpu")) {
101101
if (name.includes("mem_util")) {
102-
return `gpu_${tags[1]}_mem`;
102+
return `gpu_${tags[1]}_mem_bandwidth`;
103+
} else if (name.includes("allocated_mem_percent")) {
104+
return `gpu_${tags[1]}_allocated_mem_percent`;
103105
}
104106
return `gpu_${tags[1]}_util`;
105107
}

torchci/lib/utilization/types.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,13 @@ export interface UtilizationAggreStats {
8585
memory_p90: number;
8686
gpu_max?: number;
8787
gpu_avg?: number;
88-
gpu_memory_max?: number;
89-
gpu_memory_avg?: number;
9088
gpu_p90?: number;
91-
gpu_memmory_p90?: number;
89+
gpu_memory_bandwidth_max?: number;
90+
gpu_memory_bandwidth_avg?: number;
91+
gpu_memory_bandwidth_p90?: number;
92+
gpu_allocated_memory_max?: number;
93+
gpu_allocated_memory_avg?: number;
94+
gpu_allocated_memory_p90?: number;
9295
}
9396

9497
/**

0 commit comments

Comments
 (0)