From 1716c942fccf1d87f5be33da1061659177e1aff8 Mon Sep 17 00:00:00 2001 From: benrichard-amd Date: Fri, 6 Mar 2026 09:19:42 -0600 Subject: [PATCH 1/9] Use gfx950 builtin for MFMA FP16 --- .../rocprofiler-compute/src/utils/benchmark.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/projects/rocprofiler-compute/src/utils/benchmark.py b/projects/rocprofiler-compute/src/utils/benchmark.py index da9e87553b8..2fac5ff3792 100644 --- a/projects/rocprofiler-compute/src/utils/benchmark.py +++ b/projects/rocprofiler-compute/src/utils/benchmark.py @@ -143,7 +143,7 @@ "F6": {"gfx950": 131072}, "F6F4": {"gfx950": 131072}, # Mixed precision F6 x F4 "F8": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 32768), - "F16": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 16384), + "F16": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942"], 16384) | dict.fromkeys(["gfx950"], 32768), "F32": dict.fromkeys( ["gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 4096 ), @@ -749,15 +749,24 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: extern "C" __global__ void mfma_f16(int iter, float *dummy) { - vec4<__fp16> a; - a[1] = a[0] = threadIdx.x; - vec16 result = {0}; +#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942___) + vec4<__fp16> a; + a[1] = a[0] = threadIdx.x; for(int i = 0; i < iter; ++i) { result = __builtin_amdgcn_mfma_f32_32x32x8f16(a, a, result, 0, 0, 0); } +#elif defined(__gfx950__) + vec8<__fp16> a; + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, a, result, 0, 0, 0); + } +#else +#error "Unsupported gfx arch" +#endif if (result[0] != 2*result[0]) { From b3216fb9167e6631d936d2949a1164e2961f81d3 Mon Sep 17 00:00:00 2001 From: benrichard-amd Date: Fri, 6 Mar 2026 09:33:48 -0600 Subject: [PATCH 2/9] Use gfx950 builtin for MFMA BF16 --- .../src/utils/benchmark.py | 20 
++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/projects/rocprofiler-compute/src/utils/benchmark.py b/projects/rocprofiler-compute/src/utils/benchmark.py index 2fac5ff3792..5788c8bb7da 100644 --- a/projects/rocprofiler-compute/src/utils/benchmark.py +++ b/projects/rocprofiler-compute/src/utils/benchmark.py @@ -147,8 +147,8 @@ "F32": dict.fromkeys( ["gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 4096 ), - "BF16": dict.fromkeys(["gfx940", "gfx941", "gfx942", "gfx950"], 16384) - | dict.fromkeys(["gfx90a"], 8192), + "BF16": dict.fromkeys(["gfx940", "gfx941", "gfx942"], 16384) + | dict.fromkeys(["gfx90a"], 8192) | dict.fromkeys(["gfx950"], 32768), "I8": dict.fromkeys(["gfx940", "gfx941", "gfx942", "gfx950"], 32768) | dict.fromkeys(["gfx90a"], 16384), "F64": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 2048), @@ -785,7 +785,7 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: vec16 result = {0}; // MI100/MI200 -#if defined(__gfx908__) or defined(__gfx90a__) +#if defined(__gfx908__) || defined(__gfx90a__) vec2 a; a[1] = a[0]= threadIdx.x; @@ -794,7 +794,7 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: result = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, a, result, 0, 0, 0); } //MI300 series -#else +#elif defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) vec4 a; a[3] = a[2] = a[1] = a[0] = threadIdx.x; @@ -802,6 +802,16 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: { result = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, a, result, 0, 0, 0); } +#elif defined(__gfx950__) + vec8 a; + a[3] = a[2] = a[1] = a[0] = threadIdx.x; + + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, a, result, 0, 0, 0); + } +#else +#error "Unsupported gfx arch" #endif if (result[0] != 2*result[0]) @@ -844,7 +854,7 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> 
PerfMetrics: vec16 result = {0}; // MI100/MI200 -#if defined(__gfx908__) or defined(__gfx90a__) +#if defined(__gfx908__) || defined(__gfx90a__) int a = threadIdx.x; for(int i = 0; i < iter; ++i) From 9db42af04447a497c51c31e90143bd06761154cc Mon Sep 17 00:00:00 2001 From: benrichard-amd Date: Fri, 6 Mar 2026 10:06:31 -0600 Subject: [PATCH 3/9] Use gfx950 builtin for MFMA I8 --- .../src/utils/benchmark.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/projects/rocprofiler-compute/src/utils/benchmark.py b/projects/rocprofiler-compute/src/utils/benchmark.py index 5788c8bb7da..f6bcc33cd5c 100644 --- a/projects/rocprofiler-compute/src/utils/benchmark.py +++ b/projects/rocprofiler-compute/src/utils/benchmark.py @@ -149,8 +149,8 @@ ), "BF16": dict.fromkeys(["gfx940", "gfx941", "gfx942"], 16384) | dict.fromkeys(["gfx90a"], 8192) | dict.fromkeys(["gfx950"], 32768), - "I8": dict.fromkeys(["gfx940", "gfx941", "gfx942", "gfx950"], 32768) - | dict.fromkeys(["gfx90a"], 16384), + "I8": dict.fromkeys(["gfx940", "gfx941", "gfx942"], 32768) + | dict.fromkeys(["gfx90a"], 16384) | dict.fromkeys(["gfx950"], 65536), "F64": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 2048), } @@ -760,6 +760,7 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: } #elif defined(__gfx950__) vec8<__fp16> a; + a[7] = a[6] = a[5] = a[4] = a[3] = a[2] = a[1] = a[0] = threadIdx.x; for(int i = 0; i < iter; ++i) { result = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, a, result, 0, 0, 0); @@ -804,7 +805,7 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: } #elif defined(__gfx950__) vec8 a; - a[3] = a[2] = a[1] = a[0] = threadIdx.x; + a[7] = a[6] = a[5] = a[4] = a[3] = a[2] = a[1] = a[0] = threadIdx.x; for(int i = 0; i < iter; ++i) { @@ -862,13 +863,23 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: result = __builtin_amdgcn_mfma_i32_32x32x8i8(a, a, result, 0, 0, 0); 
} // MI300 series -#else +#elif defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) long a = threadIdx.x; for(int i = 0; i < iter; ++i) { result = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, a, result, 0, 0, 0); } +#elif defined(__gfx950__) + vec2 a; + a[1] = a[0] = threadIdx.x; + + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, a, result, 0, 0, 0); + } +#else +#error "Unsupported gfx arch" #endif if (result[0] != 2*result[0]) From 18cd682f382850df9792d369b219f917a0bd13b9 Mon Sep 17 00:00:00 2001 From: benrichard-amd Date: Fri, 6 Mar 2026 10:53:44 -0600 Subject: [PATCH 4/9] Fix comments --- projects/rocprofiler-compute/src/utils/benchmark.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/projects/rocprofiler-compute/src/utils/benchmark.py b/projects/rocprofiler-compute/src/utils/benchmark.py index f6bcc33cd5c..a25fa1db5e8 100644 --- a/projects/rocprofiler-compute/src/utils/benchmark.py +++ b/projects/rocprofiler-compute/src/utils/benchmark.py @@ -726,7 +726,7 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: extern "C" __global__ void mfma_f32(int iter, float *dummy) { - float a = threadIdx.x; + float a = threadIdx.x; vec16 result = {0}; for(int i = 0; i < iter; ++i) @@ -750,7 +750,6 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: extern "C" __global__ void mfma_f16(int iter, float *dummy) { vec16 result = {0}; - #if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942___) vec4<__fp16> a; a[1] = a[0] = threadIdx.x; @@ -794,7 +793,7 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: { result = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, a, result, 0, 0, 0); } -//MI300 series +// MI300 series #elif defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) vec4 a; a[3] = a[2] = a[1] = a[0] = threadIdx.x; @@ -803,6 +802,7 @@ def 
flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: { result = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, a, result, 0, 0, 0); } +// MI350 #elif defined(__gfx950__) vec8 a; a[7] = a[6] = a[5] = a[4] = a[3] = a[2] = a[1] = a[0] = threadIdx.x; @@ -870,6 +870,7 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: { result = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, a, result, 0, 0, 0); } +// MI350 series #elif defined(__gfx950__) vec2 a; a[1] = a[0] = threadIdx.x; From 754cf4ced9588a2889f9f06e7f083cfc572954e8 Mon Sep 17 00:00:00 2001 From: benrichard-amd Date: Fri, 6 Mar 2026 10:59:43 -0600 Subject: [PATCH 5/9] Update copyright --- projects/rocprofiler-compute/src/utils/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/rocprofiler-compute/src/utils/benchmark.py b/projects/rocprofiler-compute/src/utils/benchmark.py index a25fa1db5e8..bad14a7b54e 100644 --- a/projects/rocprofiler-compute/src/utils/benchmark.py +++ b/projects/rocprofiler-compute/src/utils/benchmark.py @@ -1,7 +1,7 @@ ############################################################################## # MIT License # -# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2026 Advanced Micro Devices, Inc. All Rights Reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal From c9873a639840e7167a833030cb2baee822a1dec4 Mon Sep 17 00:00:00 2001 From: benrichard-amd Date: Fri, 6 Mar 2026 11:03:36 -0600 Subject: [PATCH 6/9] Update CHANGELOG --- projects/rocprofiler-compute/CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index f3dd802da70..2f90bbfd1d7 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -55,6 +55,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Processes attempting to benchmark on the same GPU will wait with user-visible feedback and execute sequentially. * Lock applies specifically to the roofline.csv file generated during benchmarking, not other files generated in profile mode. +* Added proper support for `gfx950` in roofline benchmark. + * Missing metric descriptions for gfx950 and gfx942 architecture. * Added `--membw-analysis` under experimental features to allow memory bandwidth specific profiling and analysis with metric block 30. 
From 1515ee706d6eb3fa853072a03e17702d0dc8fba1 Mon Sep 17 00:00:00 2001 From: benrichard-amd Date: Fri, 6 Mar 2026 11:29:07 -0600 Subject: [PATCH 7/9] Fix formatting --- projects/rocprofiler-compute/src/utils/benchmark.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/projects/rocprofiler-compute/src/utils/benchmark.py b/projects/rocprofiler-compute/src/utils/benchmark.py index bad14a7b54e..e61bf1d24eb 100644 --- a/projects/rocprofiler-compute/src/utils/benchmark.py +++ b/projects/rocprofiler-compute/src/utils/benchmark.py @@ -143,14 +143,17 @@ "F6": {"gfx950": 131072}, "F6F4": {"gfx950": 131072}, # Mixed precision F6 x F4 "F8": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 32768), - "F16": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942"], 16384) | dict.fromkeys(["gfx950"], 32768), + "F16": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942"], 16384) + | dict.fromkeys(["gfx950"], 32768), "F32": dict.fromkeys( ["gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 4096 ), "BF16": dict.fromkeys(["gfx940", "gfx941", "gfx942"], 16384) - | dict.fromkeys(["gfx90a"], 8192) | dict.fromkeys(["gfx950"], 32768), + | dict.fromkeys(["gfx90a"], 8192) + | dict.fromkeys(["gfx950"], 32768), "I8": dict.fromkeys(["gfx940", "gfx941", "gfx942"], 32768) - | dict.fromkeys(["gfx90a"], 16384) | dict.fromkeys(["gfx950"], 65536), + | dict.fromkeys(["gfx90a"], 16384) + | dict.fromkeys(["gfx950"], 65536), "F64": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 2048), } From 90220a0d208445612721d866db6090b728cff178 Mon Sep 17 00:00:00 2001 From: Vignesh Edithal Date: Fri, 6 Mar 2026 21:40:27 +0000 Subject: [PATCH 8/9] Review comments * Clear CHANGELOG indicating we resolved roofline peaks for MI350 * Fix typo in pre-processor guard preventing roofline from running on MI300 * Ruff formatting --- projects/rocprofiler-compute/CHANGELOG.md | 4 ++-- projects/rocprofiler-compute/src/utils/benchmark.py | 5 +++-- 2 files 
changed, 5 insertions(+), 4 deletions(-) diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index 2f90bbfd1d7..debba73dcfd 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -16,6 +16,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. ### Resolved issues +* Fixed roofline benchmark MFMA FP16/BF16/INT8 peaks for MI350. + ### Upcoming changes ## ROCm Compute Profiler 3.5.0 for ROCm 7.12.0 @@ -55,8 +57,6 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Processes attempting to benchmark on the same GPU will wait with user-visible feedback and execute sequentially. * Lock applies specifically to the roofline.csv file generated during benchmarking, not other files generated in profile mode. -* Added proper support for `gfx950` in roofline benchmark. - * Missing metric descriptions for gfx950 and gfx942 architecture. * Added `--membw-analysis` under experimental features to allow memory bandwidth specific profiling and analysis with metric block 30. diff --git a/projects/rocprofiler-compute/src/utils/benchmark.py b/projects/rocprofiler-compute/src/utils/benchmark.py index e61bf1d24eb..7373292a527 100644 --- a/projects/rocprofiler-compute/src/utils/benchmark.py +++ b/projects/rocprofiler-compute/src/utils/benchmark.py @@ -1,7 +1,7 @@ ############################################################################## # MIT License # -# Copyright (c) 2026 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2025 - 2026 Advanced Micro Devices, Inc. All Rights Reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -753,7 +753,8 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: extern "C" __global__ void mfma_f16(int iter, float *dummy) { vec16 result = {0}; -#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942___) +#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || \ + defined(__gfx941__) || defined(__gfx942__) vec4<__fp16> a; a[1] = a[0] = threadIdx.x; for(int i = 0; i < iter; ++i) From a10209433518e70aa408fcb3602fdfed492aa330 Mon Sep 17 00:00:00 2001 From: benrichard-amd Date: Fri, 6 Mar 2026 20:56:32 -0600 Subject: [PATCH 9/9] Fix uninitialized variables --- projects/rocprofiler-compute/src/utils/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/rocprofiler-compute/src/utils/benchmark.py b/projects/rocprofiler-compute/src/utils/benchmark.py index 7373292a527..531c63217cc 100644 --- a/projects/rocprofiler-compute/src/utils/benchmark.py +++ b/projects/rocprofiler-compute/src/utils/benchmark.py @@ -756,7 +756,7 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: #if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || \ defined(__gfx941__) || defined(__gfx942__) vec4<__fp16> a; - a[1] = a[0] = threadIdx.x; + a[3] = a[2] = a[1] = a[0] = threadIdx.x; for(int i = 0; i < iter; ++i) { result = __builtin_amdgcn_mfma_f32_32x32x8f16(a, a, result, 0, 0, 0);