Skip to content

Commit ca8ecba

Browse files
committed
Add restructure perf metrics to TUI.
1 parent 7bab618 commit ca8ecba

File tree

9 files changed

+795
-30
lines changed

9 files changed

+795
-30
lines changed

projects/rocprofiler-compute/src/config.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
##############################################################################
2525

2626

27-
import re
2827
from pathlib import Path
2928

3029
# NB: Creating a new module to share global vars across modules

projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,13 @@
2525

2626

2727
import copy
28-
import os
2928
import sys
3029
import textwrap
31-
from abc import ABC, abstractmethod
30+
from abc import abstractmethod
3231
from collections import OrderedDict
3332
from pathlib import Path
3433

34+
import config
3535
from utils import file_io, parser, schema
3636
from utils.logger import console_debug, console_error, console_log, demarcate
3737
from utils.utils import is_workload_empty, merge_counters_spatial_multiplex
@@ -70,15 +70,22 @@ def generate_configs(self, arch, config_dir, list_stats, filter_metrics, sys_inf
7070
if list_stats:
7171
ac.panel_configs = file_io.top_stats_build_in_config
7272
else:
73-
arch_panel_config = (
73+
arch_panel_config = [
7474
config_dir if single_panel_config else config_dir.joinpath(arch)
75-
)
76-
ac.panel_configs = file_io.load_panel_configs(arch_panel_config, {})
75+
]
76+
# Use restructured perf metrics in TUI analyze mode
77+
if self.__args.tui and arch in ["gfx942", "gfx950"]:
78+
arch_panel_config.append(
79+
f"{config.rocprof_compute_home}/rocprof_compute_tui/utils/{arch}"
80+
)
81+
ac.panel_configs = file_io.load_panel_configs(arch_panel_config)
7782

7883
# TODO: filter_metrics should/might be one per arch
7984
# print(ac)
8085

81-
parser.build_dfs(archConfigs=ac, filter_metrics=filter_metrics, sys_info=sys_info)
86+
parser.build_dfs(
87+
archConfigs=ac, filter_metrics=filter_metrics, sys_info=sys_info
88+
)
8289
self._arch_configs[arch] = ac
8390
return self._arch_configs
8491

@@ -192,7 +199,9 @@ def initalize_runs(self, normalization_filter=None):
192199
arch = w.sys_info.iloc[0]["gpu_arch"]
193200
mspec = self.get_socs()[arch]._mspec
194201
if self.__args.specs_correction:
195-
w.sys_info = parser.correct_sys_info(mspec, self.__args.specs_correction)
202+
w.sys_info = parser.correct_sys_info(
203+
mspec, self.__args.specs_correction
204+
)
196205
w.avail_ips = w.sys_info["ip_blocks"].item().split("|")
197206
w.dfs = copy.deepcopy(self._arch_configs[arch].dfs)
198207
w.dfs_type = self._arch_configs[arch].dfs_type
@@ -266,7 +275,9 @@ def pre_processing(self):
266275
console_log("analysis", "deriving rocprofiler-compute metrics...")
267276
# initialize output file
268277
self._output = (
269-
open(self.__args.output_file, "w+") if self.__args.output_file else sys.stdout
278+
open(self.__args.output_file, "w+")
279+
if self.__args.output_file
280+
else sys.stdout
270281
)
271282

272283
# Read profiling config
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# TUI use only
2+
# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization
3+
Panel Config:
4+
id: 3200
5+
title: GPU Speed-of-Light
6+
metrics_description:
7+
Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
8+
been loaded from, stored to, or atomically updated in the LDS per unit time
9+
(see LDS Bandwidth example for more detail). This is also presented as a percent
10+
of the peak theoretical LDS bandwidth achievable on the specific accelerator.
11+
vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
12+
VMEM instructions per unit time. The number of bytes is calculated as the number
13+
of cache lines requested multiplied by the cache line size. This value does
14+
not consider partial requests, so e.g., if only a single value is requested
15+
in a cache line, the data movement will still be counted as a full cache line.
16+
This is also presented as a percent of the peak theoretical bandwidth achievable
17+
on the specific accelerator.
18+
L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
19+
number of bytes is calculated as the number of cache lines requested multiplied
20+
by the cache line size. This value does not consider partial requests, so e.g.,
21+
if only a single value is requested in a cache line, the data movement will
22+
still be counted as a full cache line. This is also presented as a percent of
23+
the peak theoretical bandwidth achievable on the specific accelerator.
24+
L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\
25+
\ interface per unit time. This is also presented as a percent of the peak theoretical\
26+
\ bandwidth achievable on the specific accelerator."
27+
L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
28+
interface by write and atomic operations per unit time. This is also presented
29+
as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
30+
Kernel Time: The total duration of the executed kernel.
31+
Kernel Time (Cycles): The total duration of the executed kernel in cycles.
32+
SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
33+
on a CU was actively doing any work, summed over all CUs. Low values (less than
34+
100%) indicate that the accelerator was not fully saturated by the kernel, or
35+
a potential load-imbalance issue.
36+
Clock Rate:
37+
data source:
38+
- metric_table:
39+
id: 3201
40+
title: GPU Speed-of-Light
41+
header:
42+
metric: Metric
43+
value: Avg
44+
unit: Unit
45+
peak: Peak
46+
pop: Pct of Peak
47+
metric:
48+
Theoretical LDS Bandwidth:
49+
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
50+
/ (End_Timestamp - Start_Timestamp)))
51+
unit: GB/s
52+
peak: (($max_sclk * $cu_per_gpu) * 0.128)
53+
pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
54+
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128)))
55+
vL1D Cache BW:
56+
value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
57+
unit: GB/s
58+
peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
59+
pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
60+
- Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
61+
L2 Cache BW:
62+
value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
63+
unit: GB/s
64+
peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
65+
pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
66+
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
67+
L2-Fabric Read BW:
68+
value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
69+
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
70+
- Start_Timestamp))
71+
unit: GB/s
72+
peak: $hbmBandwidth
73+
pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
74+
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
75+
- Start_Timestamp)))) / $hbmBandwidth)
76+
L2-Fabric Write BW:
77+
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
78+
* 32)) / (End_Timestamp - Start_Timestamp)))
79+
unit: GB/s
80+
peak: $hbmBandwidth
81+
pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum -
82+
TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) /
83+
$hbmBandwidth)
84+
Kernel Time:
85+
avg: AVG((End_Timestamp - Start_Timestamp))
86+
unit: ns
87+
peak: None
88+
pop: None
89+
Kernel Time (Cycles):
90+
avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
91+
unit: Cycle
92+
peak: None
93+
pop: None
94+
SIMD Utilization:
95+
value: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
96+
unit: Pct
97+
peak: 100
98+
pop: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
99+
Clock Rate:
100+
value: None
101+
unit: ns
102+
peak: None
103+
pop: None
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# TUI use only
2+
# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization
Panel Config:
3+
id: 3300
4+
title: Compute Throughput
5+
metrics_description:
6+
VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
7+
This is also presented as a percent of the peak theoretical FLOPs achievable
8+
on the specific accelerator. Note: this does not include any floating-point
9+
operations from MFMA instructions.'
10+
VALU IOPs: 'The total integer operations executed per second on the VALU. This
11+
is also presented as a percent of the peak theoretical IOPs achievable on the
12+
specific accelerator. Note: this does not include any integer operations from
13+
MFMA instructions.'
14+
MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations
15+
executed per second. This does not include any 8-bit floating point operations
16+
from VALU instructions. This is also presented as a percent of the peak theoretical
17+
F8 MFMA operations achievable on the specific accelerator. It is supported on
18+
AMD Instinct MI300 series and later only.
19+
MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
20+
executed per second. Note: this does not include any 16-bit brain floating point
21+
operations from VALU instructions. This is also presented as a percent of the
22+
peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
23+
MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
24+
per second. Note: this does not include any 16-bit floating point operations
25+
from VALU instructions. This is also presented as a percent of the peak theoretical
26+
F16 MFMA operations achievable on the specific accelerator.'
27+
MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
28+
per second. Note: this does not include any 32-bit floating point operations
29+
from VALU instructions. This is also presented as a percent of the peak theoretical
30+
F32 MFMA operations achievable on the specific accelerator.'
31+
MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
32+
per second. Note: this does not include any 64-bit floating point operations
33+
from VALU instructions. This is also presented as a percent of the peak theoretical
34+
F64 MFMA operations achievable on the specific accelerator.'
35+
MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
36+
per second. Note: this does not include any 8-bit integer operations from VALU
37+
instructions. This is also presented as a percent of the peak theoretical INT8
38+
MFMA operations achievable on the specific accelerator.'
39+
SALU Utilization: Indicates what percent of the kernel's duration the SALU was
40+
busy executing instructions. Computed as the ratio of the total number of cycles
41+
spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
42+
VALU Utilization: Indicates what percent of the kernel's duration the VALU was
43+
busy executing instructions. Does not include VMEM operations. Computed as the
44+
ratio of the total number of cycles spent by the scheduler issuing VALU instructions
45+
over the total CU cycles.
46+
MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
47+
was busy executing instructions. Computed as the ratio of the total number of
48+
cycles the MFMA was busy over the total CU cycles.
49+
VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
50+
was busy executing instructions, including both global/generic and spill/scratch
51+
operations (see the VMEM instruction count metrics for more detail). Does not
52+
include VALU operations. Computed as the ratio of the total number of cycles
53+
spent by the scheduler issuing VMEM instructions over the total CU cycles.
54+
Branch Utilization: Indicates what percent of the kernel's duration the branch
55+
unit was busy executing instructions. Computed as the ratio of the total number
56+
of cycles spent by the scheduler issuing branch instructions over the total
57+
CU cycles.
58+
IPC: The ratio of the total number of instructions executed on the CU over the
59+
total active CU cycles. This is also presented as a percent of the peak theoretical
60+
bandwidth achievable on the specific accelerator.
61+
data source:
62+
- metric_table:
63+
id: 3301
64+
title: Compute Throughput
65+
header:
66+
metric: Metric
67+
value: Avg
68+
unit: Unit
69+
peak: Peak
70+
pop: Pct of Peak
71+
metric:
72+
VALU FLOPs:
73+
value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) +
74+
SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
75+
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
76+
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
77+
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
78+
unit: GFLOP/s
79+
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
80+
pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
81+
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
82+
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
83+
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
84+
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
85+
/ (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
86+
VALU IOPs:
87+
value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
88+
- Start_Timestamp)))
89+
unit: GIOP/s
90+
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
91+
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
92+
- Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
93+
MFMA FLOPs (F8):
94+
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
95+
unit: GFLOP/s
96+
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
97+
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp -
98+
Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
99+
MFMA FLOPs (BF16):
100+
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
101+
unit: GFLOP/s
102+
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
103+
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
104+
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
105+
MFMA FLOPs (F16):
106+
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
107+
unit: GFLOP/s
108+
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
109+
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp -
110+
Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
111+
MFMA FLOPs (F32):
112+
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
113+
unit: GFLOP/s
114+
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
115+
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp -
116+
Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
117+
MFMA FLOPs (F64):
118+
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
119+
unit: GFLOP/s
120+
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
121+
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp -
122+
Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
123+
MFMA IOPs (Int8):
124+
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
125+
unit: GIOP/s
126+
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
127+
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp -
128+
Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
129+
SALU Utilization:
130+
value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
131+
unit: pct
132+
peak: 100
133+
pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
134+
VALU Utilization:
135+
value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
136+
unit: pct
137+
peak: 100
138+
pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
139+
MFMA Utilization:
140+
value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
141+
* $cu_per_gpu) * 4)))
142+
unit: pct
143+
peak: 100
144+
pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
145+
* $cu_per_gpu) * 4)))
146+
VMEM Utilization:
147+
value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
148+
/ $cu_per_gpu))
149+
unit: pct
150+
peak: 100
151+
pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
152+
/ $cu_per_gpu))
153+
Branch Utilization:
154+
value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
155+
unit: pct
156+
peak: 100
157+
pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
158+
IPC:
159+
value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
160+
unit: Instr/cycle
161+
peak: 5
162+
pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)

0 commit comments

Comments
 (0)