@@ -682,6 +682,128 @@ fn _is_amd_cdna() -> Bool:
682682 return _is_amd_mi300x() or _is_amd_mi355x()
683683
684684
@always_inline("nodebug")
fn _has_nvidia_tensor_cores() -> Bool:
    """Returns True if the NVIDIA GPU has tensor core support.

    Tensor cores were introduced with the Volta (sm_70) architecture. The
    check is written as a deny-list: every NVIDIA target is treated as
    having tensor cores unless its sub-architecture is one of the pre-Volta
    ones listed below, so newer architectures do not have to be enumerated.

    The deny-list covers all Maxwell (sm_50, sm_52, sm_53) and Pascal
    (sm_60, sm_61, sm_62) compute capabilities. Architectures older than
    Maxwell are not listed.

    Returns:
        True if the target is an NVIDIA GPU whose sub-architecture is not
        one of the listed Maxwell/Pascal variants, False otherwise.
    """
    return is_nvidia_gpu() and not (
        # Maxwell: no tensor core hardware.
        is_nvidia_gpu["sm_50"]()
        or is_nvidia_gpu["sm_52"]()
        or is_nvidia_gpu["sm_53"]()
        # Pascal: no tensor core hardware.
        or is_nvidia_gpu["sm_60"]()
        or is_nvidia_gpu["sm_61"]()
        or is_nvidia_gpu["sm_62"]()
    )
701+
702+
@always_inline("nodebug")
fn _has_amd_tensor_cores() -> Bool:
    """Returns True if the AMD GPU has usable matrix (tensor) core support.

    Only CDNA parts (MI300X, MI355X) qualify: they expose their matrix
    cores through the v_mfma_* instructions. RDNA parts do have v_wmma_*
    instructions, but they are excluded because LLVM cannot lower them yet.

    Returns:
        True if the target is an AMD CDNA GPU, False otherwise.
    """
    # On AMD, "has tensor cores" is currently the same as "is CDNA".
    return _is_amd_cdna()
714+
715+
@always_inline("nodebug")
fn _has_apple_tensor_cores() -> Bool:
    """Returns True if the Apple GPU has matrix/tensor core support.

    Apple does not use the term "tensor cores", but M-series GPUs
    (M1/M2/M3/M4) accelerate matrix multiplication through Metal
    Performance Shaders and SIMD matrix instructions, which serve the
    same purpose.

    Returns:
        True if the target is an Apple GPU, False otherwise.
    """
    # Every Apple GPU target is treated as matrix-capable.
    return is_apple_gpu()
729+
730+
@always_inline("nodebug")
fn _has_gpu_tensor_cores() -> Bool:
    """Returns True if the current GPU has tensor core support.

    Vendor-agnostic wrapper around the per-vendor checks. It reports True
    when any of them does:
    - NVIDIA GPUs with tensor cores (Volta/sm_70 and newer)
    - AMD CDNA GPUs with MFMA support (MI300X, MI355X)
    - Apple M-series GPUs (M1/M2/M3/M4 with matrix operations)

    It reports False otherwise, which includes:
    - NVIDIA Maxwell/Pascal (no tensor cores)
    - AMD RDNA (LLVM cannot lower v_wmma_* intrinsics yet)

    Returns:
        True if the GPU has working tensor core support.
    """
    # Vendors are probed in the order NVIDIA, AMD, Apple; the first match wins.
    if _has_nvidia_tensor_cores():
        return True
    if _has_amd_tensor_cores():
        return True
    return _has_apple_tensor_cores()
752+
753+
@always_inline("nodebug")
fn _has_gpu_fp32_tensor_cores() -> Bool:
    """Returns True if the GPU supports FP32 tensor core operations.

    "FP32 tensor core operations" means FP32 × FP32 → FP32 matrix
    operations executed on tensor cores or equivalent hardware.

    Reports True for:
    - NVIDIA GPUs with tensor cores (Volta/sm_70 and newer)
    - Apple M-series GPUs (FP32 via Metal simdgroup_matrix)

    Reports False for:
    - AMD RDNA/CDNA, which only accept lower-precision inputs
      (FP16/BF16/FP8/INT8) with FP32 accumulation rather than
      FP32 × FP32 → FP32
    - NVIDIA Maxwell/Pascal (no tensor cores)

    Returns:
        True if the GPU supports FP32 tensor core operations.
    """
    # AMD is deliberately left out of this check; see the docstring.
    if _has_nvidia_tensor_cores():
        return True
    return _has_apple_tensor_cores()
774+
775+
@always_inline("nodebug")
fn _has_gpu_bf16_fma() -> Bool:
    """Returns True if the GPU supports BF16 outputs with FMA operations.

    The question answered here is whether BF16 × BF16 → BF16 can be done
    with scalar/vector FMA instructions, i.e. without tensor cores.

    Reports True for:
    - NVIDIA GPUs (every NVIDIA target is accepted)
    - AMD CDNA GPUs with MFMA (MI300X, MI355X)
    - Apple GPUs (M-series support BF16 operations)

    Reports False for:
    - AMD RDNA GPUs. Their FMA path needs BF16 inputs with FP32
      outputs (FP32 accumulation). BF16 outputs are only available
      through WMMA (tensor cores), which LLVM cannot lower yet.

    Note:
        This covers FMA (non-tensor-core) operations only. For tensor
        core BF16 support, use _has_gpu_tensor_cores().

    Returns:
        True if the GPU supports BF16 output with FMA operations.
    """
    # NVIDIA: accepted regardless of architecture.
    if is_nvidia_gpu():
        return True
    # AMD: only CDNA passes; RDNA is rejected because it needs FP32
    # accumulation. The CDNA test is shared with the tensor core check.
    if _has_amd_tensor_cores():
        return True
    # Apple: accepted for all Apple GPU targets.
    return is_apple_gpu()
805+
806+
685807@always_inline (" nodebug" )
686808fn is_amd_gpu () -> Bool:
687809 """ Returns True if the target triple of the compiler is `amdgcn-amd-amdhsa`
0 commit comments