Add new 0200 yaml.

xuchen-amd · xuchen-amd · commit 7bab618eed0a · 2025-08-09T22:04:47.000-05:00
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py
@@ -73,7 +73,7 @@ def generate_configs(self, arch, config_dir, list_stats, filter_metrics, sys_inf
             arch_panel_config = (
                 config_dir if single_panel_config else config_dir.joinpath(arch)
             )
-            ac.panel_configs = file_io.load_panel_configs(arch_panel_config)
+            ac.panel_configs = file_io.load_panel_configs(arch_panel_config, {})
 
         # TODO: filter_metrics should/might be one per arch
         # print(ac)
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_base.py b/projects/rocprofiler-compute/src/rocprof_compute_base.py
@@ -226,7 +226,7 @@ def list_metrics(self):
         if arch in self.__supported_archs.keys():
             ac = schema.ArchConfig()
             ac.panel_configs = file_io.load_panel_configs(
-                self.__args.config_dir.joinpath(arch)
+                self.__args.config_dir.joinpath(arch), {}
             )
             sys_info = self.__mspec.get_class_members().iloc[0]
             parser.build_dfs(archConfigs=ac, filter_metrics=[], sys_info=sys_info)
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx942_0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_tui/utils/gfx942_0200_system_speed_of_light.yaml
@@ -0,0 +1,103 @@
+# TUI use only
+# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization
+Panel Config:
+  id: 200
+  title: System Speed-of-Light
+  metrics_description:
+    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
+      been loaded from, stored to, or atomically updated in the LDS per unit time
+      (see LDS Bandwidth example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
+      VMEM instructions per unit time. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
+      number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. This is also presented as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\
+      \ interface per unit time. This is also presented as a percent of the peak theoretical\
+      \ bandwidth achievable on the specific accelerator."
+    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
+      interface by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    Kernel Time: The total duration of the executed kernel.
+    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
+    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
+      on a CU was actively doing any work, summed over all CUs. Low values (less than
+      100%) indicate that the accelerator was not fully saturated by the kernel, or
+      a potential load-imbalance issue.
+    Clock Rate:
+  data source:
+  - metric_table:
+      id: 201
+      title: System Speed-of-Light
+      header:
+        metric: Metric
+        value: Avg
+        unit: Unit
+        peak: Peak
+        pop: Pct of Peak
+      metric:
+        Theoretical LDS Bandwidth:
+          value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
+            / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: (($max_sclk * $cu_per_gpu) * 0.128)
+          pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
+            / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128)))
+        vL1D Cache BW:
+          value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
+          pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
+            - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
+        L2 Cache BW:
+          value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
+          pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
+            / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
+        L2-Fabric Read BW:
+          value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
+            - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
+            - Start_Timestamp))
+          unit: GB/s
+          peak: $hbmBandwidth
+          pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
+            - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
+            - Start_Timestamp)))) / $hbmBandwidth)
+        L2-Fabric Write BW:
+          value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: $hbmBandwidth
+          pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum -
+            TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) /
+            $hbmBandwidth)
+        Kernel Time:
+          avg: AVG((End_Timestamp - Start_Timestamp))
+          unit: ns
+          peak: None
+          pop: None
+        Kernel Time (Cycles):
+          avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
+          unit: Cycle
+          peak: None
+          pop: None
+        SIMD Utilization:
+          value: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
+          unit: Pct
+          peak: 100
+          pop: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
+        Clock Rate:
+          value: None
+          unit: ns
+          peak: None
+          pop: None
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_tui/views/main_view.py b/projects/rocprofiler-compute/src/rocprof_compute_tui/views/main_view.py
@@ -108,57 +108,48 @@ def run_analysis(self) -> None:
         self.top_kernel_to_df_list = []
 
         if not self.selected_path:
-            try:
-                self.app.call_from_thread(
-                    lambda: self.query_one("#kernel-view").update_view(
-                        "No directory selected for analysis", LogLevel.ERROR
-                    )
+            self.app.call_from_thread(
+                lambda: self.query_one("#kernel-view").update_view(
+                    "No directory selected for analysis", LogLevel.ERROR
                 )
-            except:
-                pass
+            )
             return
 
         try:
             self.logger.info(f"Starting analysis on: {self.selected_path}")
-            try:
-                self.app.call_from_thread(
-                    lambda: self.query_one("#kernel-view").update_view(
-                        f"Running analysis on: {self.selected_path}", LogLevel.SUCCESS
-                    )
+
+            self.app.call_from_thread(
+                lambda: self.query_one("#kernel-view").update_view(
+                    f"Running analysis on: {self.selected_path}", LogLevel.SUCCESS
                 )
-            except:
-                pass
+            )
 
             # 1. Create and TUI analyzer
             analyzer = tui_analysis(
                 self.app.args, self.app.supported_archs, self.selected_path
             )
             analyzer.sanitize()
 
-            # 2. Load and process system info
+            # 2. Load and process system info and configure SoC
             sysinfo_path = Path(self.selected_path) / "sysinfo.csv"
             if not sysinfo_path.exists():
                 raise FileNotFoundError(f"sysinfo.csv not found at {sysinfo_path}")
-
             sys_info = file_io.load_sys_info(sysinfo_path).iloc[0].to_dict()
-
-            # 3. Configure SoC and run analysis
             self.app.load_soc_specs(sys_info)
+
+            # 3. Run analysis
             analyzer.set_soc(self.app.soc)
             analyzer.pre_processing()
             self.kernel_to_df_dict = analyzer.run_kernel_analysis()
             self.top_kernel_to_df_list = analyzer.run_top_kernel()
 
             if not self.kernel_to_df_dict or not self.top_kernel_to_df_list:
-                try:
-                    self.app.call_from_thread(
-                        lambda: self.query_one("#kernel-view").update_view(
-                            "Analysis completed but not all data was returned",
-                            LogLevel.WARNING,
-                        )
+                self.app.call_from_thread(
+                    lambda: self.query_one("#kernel-view").update_view(
+                        "Analysis completed but not all data was returned",
+                        LogLevel.WARNING,
                     )
-                except:
-                    pass
+                )
             else:
                 self.app.call_from_thread(self.refresh_results)
                 self.logger.info("Kernel Analysis completed successfully")
@@ -169,30 +160,22 @@ def run_analysis(self) -> None:
 
             error_msg = f"Analysis failed: {str(e)}"
             self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
-            try:
-                self.app.call_from_thread(
-                    lambda: self.query_one("#kernel-view").update_view(
-                        error_msg, LogLevel.ERROR
-                    )
+            self.app.call_from_thread(
+                lambda: self.query_one("#kernel-view").update_view(
+                    error_msg, LogLevel.ERROR
                 )
-            except:
-                pass
+            )
 
     def refresh_results(self) -> None:
-        try:
-            kernel_view = self.query_one("#kernel-view")
-            if kernel_view and self.kernel_to_df_dict and self.top_kernel_to_df_list:
-                kernel_view.update_results(
-                    self.kernel_to_df_dict, self.top_kernel_to_df_list
-                )
-                self.logger.success("Results displayed successfully.")
-            else:
-                self.logger.error("Kernel view not found or no data available")
-        except Exception as e:
-            self.logger.error(f"Error refreshing results: {str(e)}")
+        kernel_view = self.query_one("#kernel-view")
+        if kernel_view:
+            kernel_view.update_results(self.kernel_to_df_dict, self.top_kernel_to_df_list)
+            self.logger.success("Results displayed successfully.")
+        else:
+            self.logger.error("Kernel view not found or no data available")
 
     def refresh_view(self) -> None:
-        if self.top_kernel_to_df_list:
+        if self.kernel_to_df_dict and self.top_kernel_to_df_list:
             self.refresh_results()
         else:
             self.logger.warning("No data available for refresh")
diff --git a/projects/rocprofiler-compute/src/utils/file_io.py b/projects/rocprofiler-compute/src/utils/file_io.py
@@ -62,14 +62,18 @@ def load_sys_info(f):
     return pd.read_csv(f)
 
 
-def load_panel_configs(dir):
+def load_panel_configs(dir, file_replacements={}):
     """
     Load all panel configs from yaml file.
     """
     d = {}
     for root, dirs, files in os.walk(dir):
         for f in files:
             if f.endswith(".yaml"):
+                # Use replacement filename if specified, otherwise use original
+                f = file_replacements.get(f, f)
+                print(f"{f}")
+                print(f"{str(Path(root).joinpath(f))}")
                 with open(str(Path(root).joinpath(f))) as file:
                     config = yaml.safe_load(file)
                     # metric key can be None due to some metric tables not having any metrics

Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,7 @@ def generate_configs(self, arch, config_dir, list_stats, filter_metrics, sys_inf`
`73`	`73`	`arch_panel_config = (`
`74`	`74`	`config_dir if single_panel_config else config_dir.joinpath(arch)`
`75`	`75`	`)`
`76`		`- ac.panel_configs = file_io.load_panel_configs(arch_panel_config)`
	`76`	`+ ac.panel_configs = file_io.load_panel_configs(arch_panel_config, {})`
`77`	`77`
`78`	`78`	`# TODO: filter_metrics should/might be one per arch`
`79`	`79`	`# print(ac)`
Original file line number	Diff line number	Diff line change
`@@ -226,7 +226,7 @@ def list_metrics(self):`
`226`	`226`	`if arch in self.__supported_archs.keys():`
`227`	`227`	`ac = schema.ArchConfig()`
`228`	`228`	`ac.panel_configs = file_io.load_panel_configs(`
`229`		`- self.__args.config_dir.joinpath(arch)`
	`229`	`+ self.__args.config_dir.joinpath(arch), {}`
`230`	`230`	`)`
`231`	`231`	`sys_info = self.__mspec.get_class_members().iloc[0]`
`232`	`232`	`parser.build_dfs(archConfigs=ac, filter_metrics=[], sys_info=sys_info)`