@@ -775,33 +775,34 @@ fn _has_gpu_fp32_tensor_cores() -> Bool:
 
 @always_inline("nodebug")
 fn _has_gpu_bf16_fma() -> Bool:
-    """Returns True if the GPU supports BF16 outputs with FMA operations.
+    """Returns True if the GPU supports BF16 FMA operations.
 
-    This checks whether the GPU can perform BF16 × BF16 → BF16 operations
-    using scalar/vector FMA instructions (not tensor cores).
+    This checks whether the GPU can perform BF16 × BF16 operations using
+    scalar/vector FMA instructions (not tensor cores). On some platforms,
+    this may use FP32 emulation internally.
 
     Returns True for:
-    - NVIDIA GPUs (all architectures support BF16 FMA)
-    - AMD CDNA GPUs with MFMA (MI300X, MI355X)
+    - NVIDIA GPUs (all architectures support native BF16 FMA)
+    - AMD CDNA GPUs with MFMA (MI300X, MI355X - native BF16 support)
+    - AMD RDNA GPUs (RDNA3+ - emulated via FP32 accumulation)
     - Apple GPUs (M-series support BF16 operations)
 
-    Returns False for:
-    - AMD RDNA GPUs - these require FP32 accumulation for BF16 FMA.
-      BF16 outputs are only supported via WMMA (tensor cores), which
-      LLVM cannot lower yet. For FMA operations, RDNA requires
-      BF16 inputs with FP32 outputs.
+    Implementation notes:
+    - RDNA3 hardware supports BF16 via v_wmma_* instructions, but LLVM
+      cannot lower these intrinsics yet. For FMA operations, the compiler
+      automatically promotes BF16 to FP32, performs the FP32 computation,
+      then converts back to BF16. This emulation provides correct results
+      with some performance overhead.
+    - CDNA uses native v_mfma_* instructions for BF16.
 
     Note:
         This is specifically for FMA (non-tensor-core) operations.
         For tensor core BF16 support, use _has_gpu_tensor_cores().
 
     Returns:
-        True if the GPU supports BF16 output with FMA operations.
+        True if the GPU supports BF16 FMA operations (native or emulated).
     """
-    # NVIDIA: All GPUs support BF16 FMA
-    # AMD: Only CDNA (MFMA) supports BF16 outputs; RDNA requires FP32 accumulation
-    # Apple: M-series GPUs support BF16 operations
-    return is_nvidia_gpu() or _has_amd_tensor_cores() or is_apple_gpu()
+    return is_nvidia_gpu() or has_amd_gpu_accelerator() or is_apple_gpu()
 
 
 @always_inline("nodebug")
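The promote-to-FP32, compute, convert-back path that the new docstring describes for RDNA can be illustrated outside the GPU entirely. Below is a minimal Python sketch (illustrative helper names, not part of this codebase; round-to-nearest-even truncation assumed, NaN/Inf handling omitted) of BF16 FMA emulated through FP32:

```python
import struct

def f32_to_bf16_bits(x: float) -> int:
    # BF16 is the top 16 bits of an FP32 value; round-to-nearest-even
    # on the 16 discarded mantissa bits. (NaN/Inf handling omitted.)
    (bits,) = struct.unpack("<I", struct.pack("<f", x))
    rounding = 0x7FFF + ((bits >> 16) & 1)
    return ((bits + rounding) >> 16) & 0xFFFF

def bf16_bits_to_f32(b: int) -> float:
    # Widening BF16 -> FP32 is exact: restore the low 16 zero bits.
    (x,) = struct.unpack("<f", struct.pack("<I", (b & 0xFFFF) << 16))
    return x

def bf16_fma_emulated(a: float, b: float, c: float) -> float:
    # Quantize the operands to BF16, promote to FP32, do the arithmetic
    # in FP32, then truncate the result back to BF16 -- the same
    # promote/compute/convert sequence the docstring describes.
    a32 = bf16_bits_to_f32(f32_to_bf16_bits(a))
    b32 = bf16_bits_to_f32(f32_to_bf16_bits(b))
    c32 = bf16_bits_to_f32(f32_to_bf16_bits(c))
    return bf16_bits_to_f32(f32_to_bf16_bits(a32 * b32 + c32))
```

Values exactly representable in BF16 round-trip unchanged (e.g. `bf16_fma_emulated(1.0, 2.0, 3.0)` yields `5.0`); inputs needing more than 8 significant bits are rounded at each conversion, which is the precision cost of this style of emulation.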