|
143 | 143 | "F6": {"gfx950": 131072}, |
144 | 144 | "F6F4": {"gfx950": 131072}, # Mixed precision F6 x F4 |
145 | 145 | "F8": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 32768), |
146 | | - "F16": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 16384), |
| 146 | + "F16": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942"], 16384) | dict.fromkeys(["gfx950"], 32768), |
147 | 147 | "F32": dict.fromkeys( |
148 | 148 | ["gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 4096 |
149 | 149 | ), |
@@ -749,15 +749,24 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: |
749 | 749 |
|
750 | 750 | extern "C" __global__ void mfma_f16(int iter, float *dummy) |
751 | 751 | { |
752 | | - vec4<__fp16> a; |
753 | | - a[1] = a[0] = threadIdx.x; |
754 | | - |
755 | 752 | vec16<float> result = {0}; |
756 | 753 |
|
| 754 | +#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
| 755 | + vec4<__fp16> a; |
| 756 | + a[1] = a[0] = threadIdx.x; |
757 | 757 | for(int i = 0; i < iter; ++i) |
758 | 758 | { |
759 | 759 | result = __builtin_amdgcn_mfma_f32_32x32x8f16(a, a, result, 0, 0, 0); |
760 | 760 | } |
| 761 | +#elif defined(__gfx950__) |
| 762 | + vec8<__fp16> a; |
| 763 | + for(int i = 0; i < iter; ++i) |
| 764 | + { |
| 765 | + result = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, a, result, 0, 0, 0); |
| 766 | + } |
| 767 | +#else |
| 768 | +#error "Unsupported gfx arch" |
| 769 | +#endif |
761 | 770 |
|
762 | 771 | if (result[0] != 2*result[0]) |
763 | 772 | { |
|
0 commit comments