[Proton][AMD] Fix peak TB/s and support gfx950 specs (#7175)

knwng · web-flow · commit d514243cb5d5 · 2025-06-23T17:22:36.000-07:00
The formula `2 * bus_width * memory_clock_rate * 1e3 / 8`
does not yield the correct peak TB/s on AMD devices;
deriving the peak bandwidth is more involved there.

For now we just hardcode the peak TB/s value per arch to get
correct results and unblock support for gfx950.
diff --git a/python/triton_kernels/bench/bench_mlp.py b/python/triton_kernels/bench/bench_mlp.py
@@ -70,7 +70,8 @@ def opint(self):
 
     @property
     def max_tbps(self):
-        return proton.specs.max_bps(self.device_info["bus_width"], self.device_info["memory_clock_rate"]) * 1e-12
+        return proton.specs.max_bps(self.device_type, self.device_info["arch"], self.device_info["bus_width"],
+                                    self.device_info["memory_clock_rate"]) * 1e-12
 
     @property
     def max_tflops(self):
diff --git a/third_party/proton/proton/specs.py b/third_party/proton/proton/specs.py
@@ -9,13 +9,19 @@
         (width / 8),
         "100":
         lambda width, num_sms, clock_rate, **kwargs: (num_sms * 16384 * (clock_rate / 1e3) * 1e6) / (width / 8),
-    },
-    "HIP": {
-        "gfx90a": lambda width, **kwargs: 383e12 / (width / 8),
-        "gfx942": lambda width, **kwargs: 2614.9e12 / (width / 8),
-    },
+    }
 }
 
+amd_bps_by_arch = {
+    'gfx90a': 3.2 * 1e12,
+    'gfx942': 5.3 * 1e12,
+    'gfx950': 8.0 * 1e12,
+}
+
+# FP8 matrix performance (FLOPS/clock/CU)
+# For gfx90a we use the performance of INT8 since it doesn't support FP8 matrix operations.
+amd_fp8_flops_by_arch = {'gfx90a': 1024, 'gfx942': 4096, 'gfx950': 8192}
+
 
 def max_flops(device_type, arch, width, num_sms, clock_rate):
     """
@@ -31,6 +37,9 @@ def max_flops(device_type, arch, width, num_sms, clock_rate):
     Returns:
         float: The maximum FLOPS for the given device type and width.
     """
+    if device_type == "HIP":
+        return amd_fp8_flops_by_arch[arch] * num_sms * clock_rate * 1e3 / (width / 8)
+
     if device_type not in flops_by_device:
         raise ValueError(f"Unsupported device type: {device_type}")
 
@@ -42,7 +51,7 @@ def max_flops(device_type, arch, width, num_sms, clock_rate):
     return flops_func(width, num_sms=num_sms, clock_rate=clock_rate)
 
 
-def max_bps(bus_width, memory_clock_rate):
+def max_bps(device_type, arch, bus_width, memory_clock_rate):
     """
     Calculate the maximum bytes per second for a given bus width and memory clock rate.
 
@@ -53,4 +62,8 @@ def max_bps(bus_width, memory_clock_rate):
     Returns:
         float: The maximum bytes per second.
     """
-    return 2 * bus_width * memory_clock_rate * 1e3 / 8
+    if device_type == "CUDA":
+        return 2 * bus_width * memory_clock_rate * 1e3 / 8
+    else:
+        assert device_type == "HIP"
+        return amd_bps_by_arch[arch]
diff --git a/third_party/proton/proton/viewer.py b/third_party/proton/proton/viewer.py
@@ -96,9 +96,10 @@ def get_min_time_bytes(df, device_info):
         for device_index in device_info[device_type]:
             idx = df["device_id"] == device_index
             device_frames = df[idx]
-            memory_clock_rate = device_info[device_type][device_index]["memory_clock_rate"]  # in khz
-            bus_width = device_info[device_type][device_index]["bus_width"]  # in bits
-            peak_bandwidth = specs.max_bps(bus_width, memory_clock_rate)
+            device = device_info[device_type][device_index]
+            memory_clock_rate = device["memory_clock_rate"]  # in khz
+            bus_width = device["bus_width"]  # in bits
+            peak_bandwidth = specs.max_bps(device_type, device['arch'], bus_width, memory_clock_rate)
             min_time_bytes.loc[idx, "min_time"] += device_frames["bytes"] / peak_bandwidth
     return min_time_bytes
 
diff --git a/third_party/proton/test/examples/hip.json b/third_party/proton/test/examples/hip.json
@@ -30,6 +30,21 @@
           "flops8": 1e10,
           "bytes": 1e7
         }
+      },
+      {
+        "children": [],
+        "frame": {
+          "name": "foo2",
+          "type": "function"
+        },
+        "metrics": {
+          "count": 1,
+          "device_id": "2",
+          "device_type": "HIP",
+          "time (ns)": 204800,
+          "flops8": 1e12,
+          "bytes": 1e9
+        }
       }
     ],
     "frame": {
@@ -55,9 +70,16 @@
       "1": {
         "arch": "gfx942",
         "bus_width": 8192,
-        "clock_rate": 5200000,
-        "memory_clock_rate": 2525000,
+        "clock_rate": 2100000,
+        "memory_clock_rate": 1200000,
         "num_sms": 304
+      },
+      "2": {
+        "arch": "gfx950",
+        "bus_width": 8192,
+        "clock_rate": 2200000,
+        "memory_clock_rate": 1900000,
+        "num_sms": 256
       }
     }
   }
diff --git a/third_party/proton/test/test_viewer.py b/third_party/proton/test/test_viewer.py
@@ -101,10 +101,13 @@ def test_min_time_flops():
     ret = get_min_time_flops(gf.dataframe, device_info)
     device0_idx = gf.dataframe["device_id"] == "0"
     device1_idx = gf.dataframe["device_id"] == "1"
+    device2_idx = gf.dataframe["device_id"] == "2"
     # CDNA2
-    np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[0.000026]], atol=1e-5)
+    np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[0.000055]], atol=1e-5)
     # CDNA3
     np.testing.assert_allclose(ret[device1_idx].to_numpy(), [[0.000038]], atol=1e-5)
+    # CDNA4
+    np.testing.assert_allclose(ret[device2_idx].to_numpy(), [[0.000217]], atol=1e-5)
 
 
 def test_min_time_bytes():
@@ -120,10 +123,13 @@ def test_min_time_bytes():
     ret = get_min_time_bytes(gf.dataframe, device_info)
     device0_idx = gf.dataframe["device_id"] == "0"
     device1_idx = gf.dataframe["device_id"] == "1"
+    device2_idx = gf.dataframe["device_id"] == "2"
     # CDNA2
-    np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[6.10351e-06]], atol=1e-6)
+    np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[3.125e-06]], atol=1e-6)
     # CDNA3
     np.testing.assert_allclose(ret[device1_idx].to_numpy(), [[1.93378e-05]], atol=1e-6)
+    # CDNA4
+    np.testing.assert_allclose(ret[device2_idx].to_numpy(), [[0.000125]], atol=1e-6)
 
 
 def test_percentage():