Skip to content

Commit ca8ecba

Browse files
committed
Add restructure perf metrics to TUI.
1 parent 7bab618 commit ca8ecba

File tree

9 files changed

+795
-30
lines changed

9 files changed

+795
-30
lines changed

projects/rocprofiler-compute/src/config.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
##############################################################################
2525

2626

27-
import re
2827
from pathlib import Path
2928

3029
# NB: Creating a new module to share global vars across modules

projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,13 @@
2525

2626

2727
import copy
28-
import os
2928
import sys
3029
import textwrap
31-
from abc import ABC, abstractmethod
30+
from abc import abstractmethod
3231
from collections import OrderedDict
3332
from pathlib import Path
3433

34+
import config
3535
from utils import file_io, parser, schema
3636
from utils.logger import console_debug, console_error, console_log, demarcate
3737
from utils.utils import is_workload_empty, merge_counters_spatial_multiplex
@@ -70,15 +70,22 @@ def generate_configs(self, arch, config_dir, list_stats, filter_metrics, sys_inf
7070
if list_stats:
7171
ac.panel_configs = file_io.top_stats_build_in_config
7272
else:
73-
arch_panel_config = (
73+
arch_panel_config = [
7474
config_dir if single_panel_config else config_dir.joinpath(arch)
75-
)
76-
ac.panel_configs = file_io.load_panel_configs(arch_panel_config, {})
75+
]
76+
# Use restructured perf metrics in TUI analyze mode
77+
if self.__args.tui and arch in ["gfx942", "gfx950"]:
78+
arch_panel_config.append(
79+
f"{config.rocprof_compute_home}/rocprof_compute_tui/utils/{arch}"
80+
)
81+
ac.panel_configs = file_io.load_panel_configs(arch_panel_config)
7782

7883
# TODO: filter_metrics should/might be one per arch
7984
# print(ac)
8085

81-
parser.build_dfs(archConfigs=ac, filter_metrics=filter_metrics, sys_info=sys_info)
86+
parser.build_dfs(
87+
archConfigs=ac, filter_metrics=filter_metrics, sys_info=sys_info
88+
)
8289
self._arch_configs[arch] = ac
8390
return self._arch_configs
8491

@@ -192,7 +199,9 @@ def initalize_runs(self, normalization_filter=None):
192199
arch = w.sys_info.iloc[0]["gpu_arch"]
193200
mspec = self.get_socs()[arch]._mspec
194201
if self.__args.specs_correction:
195-
w.sys_info = parser.correct_sys_info(mspec, self.__args.specs_correction)
202+
w.sys_info = parser.correct_sys_info(
203+
mspec, self.__args.specs_correction
204+
)
196205
w.avail_ips = w.sys_info["ip_blocks"].item().split("|")
197206
w.dfs = copy.deepcopy(self._arch_configs[arch].dfs)
198207
w.dfs_type = self._arch_configs[arch].dfs_type
@@ -266,7 +275,9 @@ def pre_processing(self):
266275
console_log("analysis", "deriving rocprofiler-compute metrics...")
267276
# initialize output file
268277
self._output = (
269-
open(self.__args.output_file, "w+") if self.__args.output_file else sys.stdout
278+
open(self.__args.output_file, "w+")
279+
if self.__args.output_file
280+
else sys.stdout
270281
)
271282

272283
# Read profiling config
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# TUI use only
2+
# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization
3+
Panel Config:
4+
id: 3200
5+
title: GPU Speed-of-Light
6+
metrics_description:
7+
Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
8+
been loaded from, stored to, or atomically updated in the LDS per unit time
9+
(see LDS Bandwidth example for more detail). This is also presented as a percent
10+
of the peak theoretical LDS bandwidth achievable on the specific accelerator.
11+
vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
12+
VMEM instructions per unit time. The number of bytes is calculated as the number
13+
of cache lines requested multiplied by the cache line size. This value does
14+
not consider partial requests, so e.g., if only a single value is requested
15+
in a cache line, the data movement will still be counted as a full cache line.
16+
This is also presented as a percent of the peak theoretical bandwidth achievable
17+
on the specific accelerator.
18+
L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
19+
number of bytes is calculated as the number of cache lines requested multiplied
20+
by the cache line size. This value does not consider partial requests, so e.g.,
21+
if only a single value is requested in a cache line, the data movement will
22+
still be counted as a full cache line. This is also presented as a percent of
23+
the peak theoretical bandwidth achievable on the specific accelerator.
24+
L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\
25+
\ interface per unit time. This is also presented as a percent of the peak theoretical\
26+
\ bandwidth achievable on the specific accelerator."
27+
L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
28+
interface by write and atomic operations per unit time. This is also presented
29+
as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
30+
Kernel Time: The total duration of the executed kernel.
31+
Kernel Time (Cycles): The total duration of the executed kernel in cycles.
32+
SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
33+
on a CU was actively doing any work, summed over all CUs. Low values (less than
34+
100%) indicate that the accelerator was not fully saturated by the kernel, or
35+
a potential load-imbalance issue.
36+
Clock Rate:
37+
data source:
38+
- metric_table:
39+
id: 3201
40+
title: GPU Speed-of-Light
41+
header:
42+
metric: Metric
43+
value: Avg
44+
unit: Unit
45+
peak: Peak
46+
pop: Pct of Peak
47+
metric:
48+
Theoretical LDS Bandwidth:
49+
value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
50+
/ (End_Timestamp - Start_Timestamp)))
51+
unit: GB/s
52+
peak: (($max_sclk * $cu_per_gpu) * 0.128)
53+
pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
54+
/ (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128)))
55+
vL1D Cache BW:
56+
value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
57+
unit: GB/s
58+
peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
59+
pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
60+
- Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
61+
L2 Cache BW:
62+
value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
63+
unit: GB/s
64+
peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
65+
pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
66+
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
67+
L2-Fabric Read BW:
68+
value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
69+
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
70+
- Start_Timestamp))
71+
unit: GB/s
72+
peak: $hbmBandwidth
73+
pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
74+
- TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
75+
- Start_Timestamp)))) / $hbmBandwidth)
76+
L2-Fabric Write BW:
77+
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
78+
* 32)) / (End_Timestamp - Start_Timestamp)))
79+
unit: GB/s
80+
peak: $hbmBandwidth
81+
pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum -
82+
TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) /
83+
$hbmBandwidth)
84+
Kernel Time:
85+
avg: AVG((End_Timestamp - Start_Timestamp))
86+
unit: ns
87+
peak: None
88+
pop: None
89+
Kernel Time (Cycles):
90+
avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
91+
unit: Cycle
92+
peak: None
93+
pop: None
94+
SIMD Utilization:
95+
value: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
96+
unit: Pct
97+
peak: 100
98+
pop: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
99+
Clock Rate:
100+
value: None
101+
unit: ns
102+
peak: None
103+
pop: None
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# TUI use only
2+
# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization
Panel Config:
3+
id: 3300
4+
title: Compute Throughput
5+
metrics_description:
6+
VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
7+
This is also presented as a percent of the peak theoretical FLOPs achievable
8+
on the specific accelerator. Note: this does not include any floating-point
9+
operations from MFMA instructions.'
10+
VALU IOPs: 'The total integer operations executed per second on the VALU. This
11+
is also presented as a percent of the peak theoretical IOPs achievable on the
12+
specific accelerator. Note: this does not include any integer operations from
13+
MFMA instructions.'
14+
MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations
15+
executed per second. This does not include any 8-bit floating point operations
16+
from VALU instructions. This is also presented as a percent of the peak theoretical
17+
F8 MFMA operations achievable on the specific accelerator. It is supported on
18+
AMD Instinct MI300 series and later only.
19+
MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
20+
executed per second. Note: this does not include any 16-bit brain floating point
21+
operations from VALU instructions. This is also presented as a percent of the
22+
peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
23+
MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
24+
per second. Note: this does not include any 16-bit floating point operations
25+
from VALU instructions. This is also presented as a percent of the peak theoretical
26+
F16 MFMA operations achievable on the specific accelerator.'
27+
MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
28+
per second. Note: this does not include any 32-bit floating point operations
29+
from VALU instructions. This is also presented as a percent of the peak theoretical
30+
F32 MFMA operations achievable on the specific accelerator.'
31+
MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
32+
per second. Note: this does not include any 64-bit floating point operations
33+
from VALU instructions. This is also presented as a percent of the peak theoretical
34+
F64 MFMA operations achievable on the specific accelerator.'
35+
MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
36+
per second. Note: this does not include any 8-bit integer operations from VALU
37+
instructions. This is also presented as a percent of the peak theoretical INT8
38+
MFMA operations achievable on the specific accelerator.'
39+
SALU Utilization: Indicates what percent of the kernel's duration the SALU was
40+
busy executing instructions. Computed as the ratio of the total number of cycles
41+
spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
42+
VALU Utilization: Indicates what percent of the kernel's duration the VALU was
43+
busy executing instructions. Does not include VMEM operations. Computed as the
44+
ratio of the total number of cycles spent by the scheduler issuing VALU instructions
45+
over the total CU cycles.
46+
MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
47+
was busy executing instructions. Computed as the ratio of the total number of
48+
cycles the MFMA was busy over the total CU cycles.
49+
VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
50+
was busy executing instructions, including both global/generic and spill/scratch
51+
operations (see the VMEM instruction count metrics for more detail). Does not
52+
include VALU operations. Computed as the ratio of the total number of cycles
53+
spent by the scheduler issuing VMEM instructions over the total CU cycles.
54+
Branch Utilization: Indicates what percent of the kernel's duration the branch
55+
unit was busy executing instructions. Computed as the ratio of the total number
56+
of cycles spent by the scheduler issuing branch instructions over the total
57+
CU cycles.
58+
IPC: The ratio of the total number of instructions executed on the CU over the
59+
total active CU cycles. This is also presented as a percent of the peak theoretical
60+
bandwidth achievable on the specific accelerator.
61+
data source:
62+
- metric_table:
63+
id: 3301
64+
title: Compute Throughput
65+
header:
66+
metric: Metric
67+
value: Avg
68+
unit: Unit
69+
peak: Peak
70+
pop: Pct of Peak
71+
metric:
72+
VALU FLOPs:
73+
value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) +
74+
SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
75+
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
76+
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
77+
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
78+
unit: GFLOP/s
79+
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
80+
pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
81+
+ SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
82+
+ SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
83+
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
84+
+ (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
85+
/ (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
86+
VALU IOPs:
87+
value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
88+
- Start_Timestamp)))
89+
unit: GIOP/s
90+
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
91+
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
92+
- Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
93+
MFMA FLOPs (F8):
94+
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
95+
unit: GFLOP/s
96+
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
97+
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp -
98+
Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
99+
MFMA FLOPs (BF16):
100+
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
101+
unit: GFLOP/s
102+
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
103+
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
104+
- Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
105+
MFMA FLOPs (F16):
106+
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
107+
unit: GFLOP/s
108+
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
109+
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp -
110+
Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
111+
MFMA FLOPs (F32):
112+
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
113+
unit: GFLOP/s
114+
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
115+
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp -
116+
Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
117+
MFMA FLOPs (F64):
118+
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
119+
unit: GFLOP/s
120+
peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
121+
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp -
122+
Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
123+
MFMA IOPs (Int8):
124+
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
125+
unit: GIOP/s
126+
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
127+
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp -
128+
Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
129+
SALU Utilization:
130+
value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
131+
unit: pct
132+
peak: 100
133+
pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
134+
VALU Utilization:
135+
value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
136+
unit: pct
137+
peak: 100
138+
pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
139+
MFMA Utilization:
140+
value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
141+
* $cu_per_gpu) * 4)))
142+
unit: pct
143+
peak: 100
144+
pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
145+
* $cu_per_gpu) * 4)))
146+
VMEM Utilization:
147+
value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
148+
/ $cu_per_gpu))
149+
unit: pct
150+
peak: 100
151+
pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
152+
/ $cu_per_gpu))
153+
Branch Utilization:
154+
value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
155+
unit: pct
156+
peak: 100
157+
pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
158+
IPC:
159+
value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
160+
unit: Instr/cycle
161+
peak: 5
162+
pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)

0 commit comments

Comments
 (0)