@@ -2919,6 +2919,20 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
29192919// the form: D = A * B + C.
29202920// A is sparse matrix, half the size of B, and is expanded using sparsity index.
29212921
// Signature template for SWMMAC intrinsics that take a sparsity index for A
// plus two trailing i1 flags (matrix_a_reuse, matrix_b_reuse). Unlike
// AMDGPUSWmmacIntrinsicABIdx there are no %A_mod/%B_mod operands. Both flags
// are listed as ImmArg (operands 4 and 5), so callers must pass constants.
2922+ class AMDGPUSWmmacIntrinsicIdxReuse<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
2923+ Intrinsic<
2924+ [CD], // %D (result)
2925+ [
2926+ A, // %A (operand 0)
2927+ B, // %B (operand 1)
2928+ LLVMMatchType<0>, // %C (operand 2): tied to overloaded type 0, i.e. the type of %D when CD is overloaded
2929+ Index, // %Sparsity index for A (operand 3)
2930+ llvm_i1_ty, // matrix_a_reuse (operand 4, immediate)
2931+ llvm_i1_ty, // matrix_b_reuse (operand 5, immediate)
2932+ ],
2933+ [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]
2934+ >;
2935+
29222936class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
29232937 Intrinsic<
29242938 [CD], // %D
@@ -3602,6 +3616,161 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
36023616 [IntrNoMem, IntrSpeculatable]
36033617>;
36043618
3619+ // WMMA intrinsics.
// WMMA signature template with i1 modifier operands for A and B only (no
// %C_mod), followed by the two reuse flags. Operands 0, 2, 5 and 6 are listed
// as ImmArg, so the modifiers and reuse flags must be constants.
3620+ class AMDGPUWmmaIntrinsicModsAB<LLVMType AB, LLVMType CD> :
3621+ Intrinsic<
3622+ [CD], // %D (result)
3623+ [
3624+ llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg (operand 0, immediate)
3625+ AB, // %A (operand 1)
3626+ llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg (operand 2, immediate)
3627+ LLVMMatchType<1>, // %B (operand 3): tied to overloaded type 1, i.e. the type of %A when CD and AB are both overloaded
3628+ LLVMMatchType<0>, // %C (operand 4): tied to overloaded type 0, i.e. the type of %D when CD is overloaded
3629+ llvm_i1_ty, // matrix_a_reuse (operand 5, immediate)
3630+ llvm_i1_ty, // matrix_b_reuse (operand 6, immediate)
3631+ ],
3632+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
3633+ IntrWillReturn, IntrNoCallback, IntrNoFree]
3634+ >;
3635+
// WMMA signature template with a modifier operand for C only (no %A_mod or
// %B_mod), followed by the two reuse flags. Operands 2, 4 and 5 are listed as
// ImmArg, so %C_mod and the reuse flags must be constants.
3636+ class AMDGPUWmmaIntrinsicModsC<LLVMType AB, LLVMType CD> :
3637+ Intrinsic<
3638+ [CD], // %D (result)
3639+ [
3640+ AB, // %A (operand 0)
3641+ LLVMMatchType<1>, // %B (operand 1): tied to overloaded type 1, i.e. the type of %A when CD and AB are both overloaded
3642+ llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) (operand 2, immediate)
3643+ LLVMMatchType<0>, // %C (operand 3): tied to overloaded type 0, i.e. the type of %D when CD is overloaded
3644+ llvm_i1_ty, // matrix_a_reuse (operand 4, immediate)
3645+ llvm_i1_ty, // matrix_b_reuse (operand 5, immediate)
3646+ ],
3647+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>,
3648+ IntrWillReturn, IntrNoCallback, IntrNoFree]
3649+ >;
3650+
// WMMA signature template where A and B have independent type parameters and
// only C has a modifier operand. This template has no matrix_a_reuse or
// matrix_b_reuse operands. Operand 2 (%C_mod) is listed as ImmArg.
3651+ class AMDGPUWmmaIntrinsicF4ModsC<LLVMType A, LLVMType B, LLVMType CD> :
3652+ Intrinsic<
3653+ [CD], // %D (result)
3654+ [
3655+ A, // %A (operand 0)
3656+ B, // %B (operand 1): own type parameter, not tied to %A
3657+ llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) (operand 2, immediate)
3658+ LLVMMatchType<0>, // %C (operand 3): tied to overloaded type 0, i.e. the type of %D when CD is overloaded
3659+ ],
3660+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
3661+ >;
3662+
// WMMA signature template with modifier operands for A, B and C and no reuse
// flags (compare AMDGPUWmmaIntrinsicModsAllReuse). Operands 0, 2 and 4 are
// listed as ImmArg, so all three modifiers must be constants.
3663+ class AMDGPUWmmaIntrinsicModsAll<LLVMType AB, LLVMType CD> :
3664+ Intrinsic<
3665+ [CD], // %D (result)
3666+ [
3667+ llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg (operand 0, immediate)
3668+ AB, // %A (operand 1)
3669+ llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg (operand 2, immediate)
3670+ LLVMMatchType<1>, // %B (operand 3): tied to overloaded type 1, i.e. the type of %A when CD and AB are both overloaded
3671+ llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) (operand 4, immediate)
3672+ LLVMMatchType<0>, // %C (operand 5): tied to overloaded type 0, i.e. the type of %D when CD is overloaded
3673+ ],
3674+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
3675+ >;
3676+
// Same operand layout as AMDGPUWmmaIntrinsicModsAll with two extra trailing i1
// operands, matrix_a_reuse and matrix_b_reuse. Operands 0, 2, 4, 6 and 7 are
// listed as ImmArg, so the modifiers and reuse flags must be constants.
3677+ class AMDGPUWmmaIntrinsicModsAllReuse<LLVMType AB, LLVMType CD> :
3678+ Intrinsic<
3679+ [CD], // %D (result)
3680+ [
3681+ llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg (operand 0, immediate)
3682+ AB, // %A (operand 1)
3683+ llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg (operand 2, immediate)
3684+ LLVMMatchType<1>, // %B (operand 3): tied to overloaded type 1, i.e. the type of %A when CD and AB are both overloaded
3685+ llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) (operand 4, immediate)
3686+ LLVMMatchType<0>, // %C (operand 5): tied to overloaded type 0, i.e. the type of %D when CD is overloaded
3687+ llvm_i1_ty, // matrix_a_reuse (operand 6, immediate)
3688+ llvm_i1_ty, // matrix_b_reuse (operand 7, immediate)
3689+ ],
3690+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
3691+ IntrWillReturn, IntrNoCallback, IntrNoFree]
3692+ >;
3693+
3694+ // D and C are of different types: %D uses DstTy and %C uses its own type parameter C rather than LLVMMatchType<0>. Otherwise the operand layout and ImmArg indices (0, 2, 4, 6, 7) are the same as AMDGPUWmmaIntrinsicModsAllReuse.
3695+ class AMDGPUWmmaIntrinsicModsAllDiff<LLVMType DstTy, LLVMType AB, LLVMType C> :
3696+ Intrinsic<
3697+ [DstTy], // %D (result)
3698+ [
3699+ llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg (operand 0, immediate)
3700+ AB, // %A (operand 1)
3701+ llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg (operand 2, immediate)
3702+ LLVMMatchType<1>, // %B (operand 3): tied to overloaded type 1, i.e. the type of %A when DstTy and AB are both overloaded
3703+ llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) (operand 4, immediate)
3704+ C, // %C (operand 5): independent of the %D type
3705+ llvm_i1_ty, // matrix_a_reuse (operand 6, immediate)
3706+ llvm_i1_ty, // matrix_b_reuse (operand 7, immediate)
3707+ ],
3708+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
3709+ IntrWillReturn, IntrNoCallback, IntrNoFree]
3710+ >;
3711+
// Every intrinsic defined inside this defset is also collected into the list
// AMDGPUWMMAIntrinsicsGFX1250. The template each def uses decides which
// modifier and reuse operands it has:
//   - f64_16x16x4_f64: ModsAll (A/B/C modifiers, no reuse flags).
//   - other f32/f16/bf16 inputs: ModsAllReuse, or ModsAllDiff for bf16f32.
//   - fp8/bf8 inputs: ModsC (integer A/B, float C/D, C modifier only).
//   - iu8 inputs: ModsAB (integer A/B/C/D, A/B modifiers only).
//   - f4 inputs: F4ModsC (separately typed integer A and B, no reuse flags).
3712+ defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = {
3713+ def int_amdgcn_wmma_f64_16x16x4_f64 : AMDGPUWmmaIntrinsicModsAll<llvm_anyfloat_ty, llvm_anyfloat_ty>;
3714+ def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
3715+ def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
3716+ def int_amdgcn_wmma_f32_16x16x32_f16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
3717+ def int_amdgcn_wmma_f16_16x16x32_f16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
3718+ def int_amdgcn_wmma_bf16_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
3719+ def int_amdgcn_wmma_bf16f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllDiff<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty>;
3720+ def int_amdgcn_wmma_f32_16x16x64_fp8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3721+ def int_amdgcn_wmma_f32_16x16x64_fp8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3722+ def int_amdgcn_wmma_f32_16x16x64_bf8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3723+ def int_amdgcn_wmma_f32_16x16x64_bf8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3724+ def int_amdgcn_wmma_f16_16x16x64_fp8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3725+ def int_amdgcn_wmma_f16_16x16x64_fp8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3726+ def int_amdgcn_wmma_f16_16x16x64_bf8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3727+ def int_amdgcn_wmma_f16_16x16x64_bf8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3728+ def int_amdgcn_wmma_f16_16x16x128_fp8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3729+ def int_amdgcn_wmma_f16_16x16x128_fp8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3730+ def int_amdgcn_wmma_f16_16x16x128_bf8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3731+ def int_amdgcn_wmma_f16_16x16x128_bf8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3732+ def int_amdgcn_wmma_f32_16x16x128_fp8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3733+ def int_amdgcn_wmma_f32_16x16x128_fp8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3734+ def int_amdgcn_wmma_f32_16x16x128_bf8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3735+ def int_amdgcn_wmma_f32_16x16x128_bf8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3736+ def int_amdgcn_wmma_i32_16x16x64_iu8 : AMDGPUWmmaIntrinsicModsAB<llvm_anyint_ty, llvm_anyint_ty>;
3737+ def int_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUWmmaIntrinsicF4ModsC<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty>;
3738+ }
3739+
// Signature template for SWMMAC intrinsics with i1 modifier operands for A and
// B, a sparsity index for A, and the two reuse flags. A and B have independent
// type parameters. Operands 0, 2, 6 and 7 are listed as ImmArg, so the
// modifiers and reuse flags must be constants.
3740+ class AMDGPUSWmmacIntrinsicABIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
3741+ Intrinsic<
3742+ [CD], // %D (result)
3743+ [
3744+ llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg (operand 0, immediate)
3745+ A, // %A (operand 1)
3746+ llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg (operand 2, immediate)
3747+ B, // %B (operand 3): own type parameter, not tied to %A
3748+ LLVMMatchType<0>, // %C (operand 4): tied to overloaded type 0, i.e. the type of %D when CD is overloaded
3749+ Index, // %Sparsity index for A (operand 5)
3750+ llvm_i1_ty, // matrix_a_reuse (operand 6, immediate)
3751+ llvm_i1_ty, // matrix_b_reuse (operand 7, immediate)
3752+ ],
3753+ [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>]
3754+ >;
3755+
// Every intrinsic defined inside this defset is also collected into the list
// AMDGPUSWMMACIntrinsicsGFX1250. The f16/bf16 and iu8 variants use
// AMDGPUSWmmacIntrinsicABIdx (with %A_mod/%B_mod operands). The fp8/bf8
// variants use AMDGPUSWmmacIntrinsicIdxReuse (no modifier operands), with
// integer A/B and float C/D. All take an overloaded integer sparsity index.
3756+ defset list<Intrinsic> AMDGPUSWMMACIntrinsicsGFX1250 = {
3757+ def int_amdgcn_swmmac_f32_16x16x64_f16 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3758+ def int_amdgcn_swmmac_f32_16x16x64_bf16 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3759+ def int_amdgcn_swmmac_f16_16x16x64_f16 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3760+ def int_amdgcn_swmmac_bf16_16x16x64_bf16 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3761+ def int_amdgcn_swmmac_bf16f32_16x16x64_bf16 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3762+ def int_amdgcn_swmmac_f32_16x16x128_fp8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3763+ def int_amdgcn_swmmac_f32_16x16x128_fp8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3764+ def int_amdgcn_swmmac_f32_16x16x128_bf8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3765+ def int_amdgcn_swmmac_f32_16x16x128_bf8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3766+ def int_amdgcn_swmmac_f16_16x16x128_fp8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3767+ def int_amdgcn_swmmac_f16_16x16x128_fp8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3768+ def int_amdgcn_swmmac_f16_16x16x128_bf8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3769+ def int_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3770+ def int_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
3771+ }
3772+
3773+
36053774class AMDGPUTensorLoadStore:
36063775 Intrinsic<
36073776 [],
0 commit comments