Skip to content

Commit 1716c94

Browse files
benrichard-amd authored and vedithal-amd committed
Use gfx950 builtin for MFMA FP16
1 parent 2501e5e commit 1716c94

File tree

1 file changed

+13
-4
lines changed

1 file changed

+13
-4
lines changed

projects/rocprofiler-compute/src/utils/benchmark.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -143,7 +143,7 @@
143143
"F6": {"gfx950": 131072},
144144
"F6F4": {"gfx950": 131072}, # Mixed precision F6 x F4
145145
"F8": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 32768),
146-
"F16": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 16384),
146+
"F16": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942"], 16384) | dict.fromkeys(["gfx950"], 32768),
147147
"F32": dict.fromkeys(
148148
["gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 4096
149149
),
@@ -749,15 +749,24 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics:
749749
750750
extern "C" __global__ void mfma_f16(int iter, float *dummy)
751751
{
752-
vec4<__fp16> a;
753-
a[1] = a[0] = threadIdx.x;
754-
755752
vec16<float> result = {0};
756753
754+
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942___)
755+
vec4<__fp16> a;
756+
a[1] = a[0] = threadIdx.x;
757757
for(int i = 0; i < iter; ++i)
758758
{
759759
result = __builtin_amdgcn_mfma_f32_32x32x8f16(a, a, result, 0, 0, 0);
760760
}
761+
#elif defined(__gfx950__)
762+
vec8<__fp16> a;
763+
for(int i = 0; i < iter; ++i)
764+
{
765+
result = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, a, result, 0, 0, 0);
766+
}
767+
#else
768+
#error "Unsupported gfx arch"
769+
#endif
761770
762771
if (result[0] != 2*result[0])
763772
{

0 commit comments

Comments
 (0)