[rocprofiler-compute] Roofline: Use gfx950 builtins for MFMA (#3886)

vedithal-amd · benrichard-amd · vedithal-amd · commit 4380c1987747 · 2026-03-10T20:54:24.000Z
* Use gfx950 builtin for MFMA FP16

* Use gfx950 builtin for MFMA BF16

* Use gfx950 builtin for MFMA I8

* Fix comments

* Update copyright

* Update CHANGELOG

* Fix formatting

* Review comments

* Clear CHANGELOG indicating we resolved roofline peaks for MI350

* Fix typo in pre-processor guard preventing roofline from running on
  MI300

* Ruff formatting

* Fix uninitialized variables

---------

Co-authored-by: benrichard-amd <ben.richard@amd.com>
diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md
@@ -16,6 +16,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
 
 ### Resolved issues
 
+* Fixed roofline benchmark MFMA FP16/BF16/INT8 peaks for MI350
+
 ### Upcoming changes
 
 ## ROCm Compute Profiler 3.5.0 for ROCm 7.12.0
diff --git a/projects/rocprofiler-compute/src/utils/benchmark.py b/projects/rocprofiler-compute/src/utils/benchmark.py
@@ -1,7 +1,7 @@
 ##############################################################################
 # MIT License
 #
-# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+# Copyright (c) 2025 - 2026 Advanced Micro Devices, Inc. All Rights Reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -143,14 +143,17 @@
     "F6": {"gfx950": 131072},
     "F6F4": {"gfx950": 131072},  # Mixed precision F6 x F4
     "F8": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 32768),
-    "F16": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 16384),
+    "F16": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942"], 16384)
+    | dict.fromkeys(["gfx950"], 32768),
     "F32": dict.fromkeys(
         ["gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 4096
     ),
-    "BF16": dict.fromkeys(["gfx940", "gfx941", "gfx942", "gfx950"], 16384)
-    | dict.fromkeys(["gfx90a"], 8192),
-    "I8": dict.fromkeys(["gfx940", "gfx941", "gfx942", "gfx950"], 32768)
-    | dict.fromkeys(["gfx90a"], 16384),
+    "BF16": dict.fromkeys(["gfx940", "gfx941", "gfx942"], 16384)
+    | dict.fromkeys(["gfx90a"], 8192)
+    | dict.fromkeys(["gfx950"], 32768),
+    "I8": dict.fromkeys(["gfx940", "gfx941", "gfx942"], 32768)
+    | dict.fromkeys(["gfx90a"], 16384)
+    | dict.fromkeys(["gfx950"], 65536),
     "F64": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 2048),
 }
 
@@ -726,7 +729,7 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics:
 
 extern "C" __global__ void mfma_f32(int iter, float *dummy)
 {
-    float a =  threadIdx.x;
+    float a = threadIdx.x;
     vec16<float> result = {0};
 
     for(int i = 0; i < iter; ++i)
@@ -749,15 +752,25 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics:
 
 extern "C" __global__ void mfma_f16(int iter, float *dummy)
 {
-    vec4<__fp16> a;
-    a[1] = a[0] = threadIdx.x;
-    
     vec16<float> result = {0};
-
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || \
+    defined(__gfx941__) || defined(__gfx942__)
+    vec4<__fp16> a;
+    a[3] = a[2] = a[1] = a[0] = threadIdx.x;
     for(int i = 0; i < iter; ++i)
     {
         result = __builtin_amdgcn_mfma_f32_32x32x8f16(a, a, result, 0, 0, 0);
     }
+#elif defined(__gfx950__)
+    vec8<__fp16> a;
+    a[7] = a[6] = a[5] = a[4] = a[3] = a[2] = a[1] = a[0] = threadIdx.x;
+    for(int i = 0; i < iter; ++i)
+    {
+        result = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, a, result, 0, 0, 0);
+    }
+#else
+#error "Unsupported gfx arch"
+#endif
 
     if (result[0] != 2*result[0])
     {
@@ -776,23 +789,34 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics:
     vec16<float> result = {0};
 
 // MI100/MI200
-#if defined(__gfx908__) or defined(__gfx90a__)
+#if defined(__gfx908__) || defined(__gfx90a__)
     vec2<short> a;
     a[1] = a[0]= threadIdx.x;
 
     for(int i = 0; i < iter; ++i)
     {
         result = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, a, result, 0, 0, 0);
     }
-//MI300 series
-#else
+// MI300 series
+#elif defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
     vec4<short> a;
     a[3] = a[2] = a[1] = a[0] = threadIdx.x;
 
     for(int i = 0; i < iter; ++i)
     {
         result = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, a, result, 0, 0, 0);
     }
+// MI350
+#elif defined(__gfx950__)
+    vec8<short> a;
+    a[7] = a[6] = a[5] = a[4] = a[3] = a[2] = a[1] = a[0] = threadIdx.x;
+
+    for(int i = 0; i < iter; ++i)
+    {
+        result = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, a, result, 0, 0, 0);
+    }
+#else
+#error "Unsupported gfx arch"
 #endif
 
     if (result[0] != 2*result[0])
@@ -835,21 +859,32 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics:
     vec16<int> result = {0};
 
 // MI100/MI200
-#if defined(__gfx908__) or defined(__gfx90a__)
+#if defined(__gfx908__) || defined(__gfx90a__)
     int a = threadIdx.x;
 
     for(int i = 0; i < iter; ++i)
     {
         result = __builtin_amdgcn_mfma_i32_32x32x8i8(a, a, result, 0, 0, 0);
     }
 // MI300 series
-#else
+#elif defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
     long a =  threadIdx.x;
 
     for(int i = 0; i < iter; ++i)
     {
         result = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, a, result, 0, 0, 0);
     }
+// MI350 series
+#elif defined(__gfx950__)
+    vec2<long> a;
+    a[1] = a[0] = threadIdx.x;
+
+    for(int i = 0; i < iter; ++i)
+    {
+        result = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, a, result, 0, 0, 0);
+    }
+#else
+#error "Unsupported gfx arch"
 #endif
 
     if (result[0] != 2*result[0])